In [1]:
!pip install  bitsandbytes transformers peft accelerate datasets trl flash_attn
!pip install wandb



In [2]:
import os
import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset, concatenate_datasets
from peft import (
    LoraConfig,
    PeftConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)

from trl import SFTTrainer

In [3]:
from huggingface_hub import login


login(token=HF_Token)




The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
import wandb

wandb.login(key= WandB_Key)




True

In [5]:
model_id = "meta-llama/Meta-Llama-3-8B"
dataset_name = "wikimedia/wikipedia"
dataset_split = '20231101.en'

In [6]:
dataset_1 = load_dataset(dataset_name, dataset_split)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

In [7]:
dataset_1 = (dataset_1['train'].select(range(20000)))

In [8]:
dataset_1


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 20000
})

In [9]:
print(dataset_1['text'][0])

Anarchism is a political philosophy and movement that is skeptical of all justifications for authority and seeks to abolish the institutions it claims maintain unnecessary coercion and hierarchy, typically including nation-states, and capitalism. Anarchism advocates for the replacement of the state with stateless societies and voluntary free associations. As a historically left-wing movement, this reading of anarchism is placed on the farthest left of the political spectrum, usually described as the libertarian wing of the socialist movement (libertarian socialism).

Humans have lived in societies without formal hierarchies long before the establishment of states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist ideas are found all throughout history, modern anarchism emerged from the Enlightenment. During the latter half of the 19th and the first decades of the 20th century, the anarchist movement f

In [10]:
# 'model_id' and 'model_name' are the identifiers for the pre-trained model from Hugging Face hub that you want to fine-tune.
model_id = model_id
model_name = model_id



# 'dataset_split' is the split of the dataset that you want to use for fine-tuning. In this case, it is set to 'train', which means that the training split of the dataset will be used.
#dataset_split= "train"

# 'new_model' is the name that you want to give to the fine-tuned model.
new_model = "new-model-name"

# 'hf_model_repo' is the identifier for the Hugging Face repository where you want to save the fine-tuned model.
hf_model_repo="username/"+new_model

# Load Model on GPU

# 'device_map' is a dictionary that maps devices to model parts. In this case, it is set to {"": 0}, which means that the entire model will be loaded on GPU 0.
device_map = {"": 0}

# Bits and Bytes configuration for the model

# 'use_4bit' is a boolean that controls whether 4-bit precision should be used for loading the base model.
use_4bit = True

# 'bnb_4bit_compute_dtype' is the data type that should be used for computations with the 4-bit base model. In this case, it is set to 'bfloat16'.
bnb_4bit_compute_dtype = "bfloat16"

# 'bnb_4bit_quant_type' is the type of quantization that should be used for the 4-bit base model. In this case, it is set to 'nf4'.
bnb_4bit_quant_type = "nf4"

# 'use_double_quant' is a boolean that controls whether nested quantization should be used for the 4-bit base model.
use_double_quant = True

# LoRA configuration for the model

# 'lora_r' is the dimension of the LoRA attention.
lora_r = 64

# 'lora_alpha' is the alpha parameter for LoRA scaling.
lora_alpha = 64

# 'lora_dropout' is the dropout probability for LoRA layers.
lora_dropout = 0.05

# 'target_modules' is a list of the modules that should be targeted by LoRA.
target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]

In [11]:
tokenizer_id = model_id
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.padding_side = 'right' # to prevent warnings
context_length = 512

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


tokenized_datasets = dataset_1.map(
    tokenize, batched=True, remove_columns=dataset_1.column_names
)
tokenized_datasets



Dataset({
    features: ['input_ids'],
    num_rows: 50267
})

In [13]:
dataset_chatml = tokenized_datasets.train_test_split(test_size=0.05, seed=1234)
dataset_chatml

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 47753
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 2514
    })
})

In [14]:
print(dataset_chatml['train']['input_ids'][0])

[128000, 3092, 20941, 889, 374, 304, 23070, 1210, 320, 35596, 33, 340, 10227, 4751, 364, 6263, 434, 6, 2564, 1618, 706, 279, 4623, 315, 586, 96530, 315, 68078, 311, 10811, 13, 2564, 5810, 279, 12103, 315, 2418, 389, 279, 44721, 48466, 288, 477, 47591, 1274, 1603, 279, 67698, 5590, 1975, 510, 85, 13, 220, 1644, 60, 2030, 39371, 47591, 757, 1603, 1274, 11, 358, 690, 1101, 23973, 1603, 856, 20941, 304, 23070, 13, 2564, 1115, 374, 264, 3831, 10163, 11, 369, 364, 998, 23973, 6, 2564, 1618, 3445, 311, 5790, 9933, 3771, 323, 374, 4221, 315, 39571, 6539, 13, 763, 420, 56529, 21765, 11, 433, 3445, 430, 1274, 26457, 304, 311, 7410, 323, 5790, 9933, 3771, 311, 5766, 27242, 477, 4648, 10246, 19475, 220, 972, 25, 21, 12, 24, 1389, 510, 55065, 374, 12365, 311, 813, 49260, 60, 330, 4071, 39371, 11384, 832, 315, 1521, 2697, 6305, 35090, 304, 2206, 311, 4498, 510, 4991, 17086, 450, 56761, 60, 2345, 275, 1053, 387, 2731, 369, 1461, 430, 264, 1541, 798, 753, 2606, 11046, 387, 18799, 2212, 813, 13272, 323

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True, add_eos_token=True, use_fast=True)
#tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=use_double_quant,
)

model = AutoModelForCausalLM.from_pretrained(
          model_id, torch_dtype=torch.bfloat16, trust_remote_code=True, quantization_config=bnb_config, device_map=device_map,
          attn_implementation='flash_attention_2',
)



model = prepare_model_for_kbit_training(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaFlashAttention2(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): 

In [17]:
args = TrainingArguments(
        output_dir="./llama-qLoRA",
        evaluation_strategy="steps",
        do_eval=True,
        optim="adamw_torch",
        per_device_train_batch_size=16,
        gradient_accumulation_steps=2,
        per_device_eval_batch_size=16,
        log_level="debug",
        save_strategy="epoch",
        logging_steps=10,
        learning_rate=1e-6,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        eval_steps=500,
        num_train_epochs=3,
        warmup_ratio=0.05,
        lr_scheduler_type="inverse_sqrt",
        report_to="wandb",
        seed=42,)

peft_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        task_type='CAUSAL_LM',
        target_modules=target_modules,)



In [18]:
trainer = SFTTrainer(
        model=model,
        max_seq_length=512,
        train_dataset=dataset_chatml['train'],
        eval_dataset=dataset_chatml['test'],
        peft_config=peft_config,
        tokenizer=tokenizer,
        args=args,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Using auto half precision backend


In [19]:
trainer.train()

Currently training with a batch size of: 16
***** Running training *****
  Num examples = 47,753
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 4,476
  Number of trainable parameters = 167,772,160
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss,Validation Loss
500,1.9835,2.004669
1000,1.97,1.990098
1500,1.9764,1.983812
2000,1.9714,1.979873
2500,1.9547,1.977024



***** Running Evaluation *****
  Num examples = 2514
  Batch size = 16
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)

***** Running Evaluation *****
  Num examples = 2514
  Batch size = 16
Saving model checkpoint to ./llama-qLoRA/checkpoint-1492
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "ll

Step,Training Loss,Validation Loss
500,1.9835,2.004669
1000,1.97,1.990098
1500,1.9764,1.983812
2000,1.9714,1.979873
2500,1.9547,1.977024
3000,1.981,1.974702
3500,1.9841,1.972887
4000,1.9469,1.971264


Saving model checkpoint to ./llama-qLoRA/checkpoint-2985
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.4",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in ./llama-qLoRA/checkpoint-2985/token

TrainOutput(global_step=4476, training_loss=1.9818504881496617, metrics={'train_runtime': 35402.196, 'train_samples_per_second': 4.047, 'train_steps_per_second': 0.126, 'total_flos': 3.375723401887875e+18, 'train_loss': 1.9818504881496617, 'epoch': 2.998994974874372})

# New Section

In [20]:
trainer.save_model()

Saving model checkpoint to ./llama-qLoRA
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.4",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in ./llama-qLoRA/tokenizer_config.json
Special tokens 

In [21]:
trainer.push_to_hub("dhanishetty/llama_adapters" #, token= "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"
                    )

Saving model checkpoint to ./llama-qLoRA
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B/snapshots/62bd457b6fe961a42a631306577e622c83876cb6/config.json
Model config LlamaConfig {
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.4",
  "use_cache": true,
  "vocab_size": 128256
}

tokenizer config file saved in ./llama-qLoRA/tokenizer_config.json
Special tokens 

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dhanishetty/llama-qLoRA/commit/a35a0224fbbaaf0128c204d372631ad6c92e29c0', commit_message='dhanishetty/llama_adapters', commit_description='', oid='a35a0224fbbaaf0128c204d372631ad6c92e29c0', pr_url=None, pr_revision=None, pr_num=None)