In [1]:
!pip install accelerate peft transformers trl torch
!pip install bitsandbytes>=0.37.0

Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.7.10-py3-none-any.whl (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from trl)
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.7.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.7/79.7 kB[0m [31m9.2 MB/s[0m eta [36m0:00:

In [2]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, AutoConfig
from trl import SFTTrainer
import os

In [3]:
dataset="burkelibbey/colors"
model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
output_model="tinyllama-colorist-v1"

**Setting up the Data**

In [4]:
def formatted_train(input,response)->str:
    return f"<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>\n"

In [5]:
def prepare_train_data(dataset):
  data = load_dataset(dataset, split="train")
  data_df = data.to_pandas()
  data_df["text"] = data_df[["description", "color"]].apply(lambda x: "<|im_start|>user\n" + x["description"] + " <|im_end|>\n<|im_start|>assistant\n" + x["color"] + "<|im_end|>\n", axis=1)
  data = Dataset.from_pandas(data_df)
  return data

In [6]:
data = prepare_train_data(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.38M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
data

Dataset({
    features: ['color', 'description', 'text'],
    num_rows: 33887
})

In [8]:
data[1]

{'color': '#000010',
 'description': 'Extremely dark blue: This is such a dark shade of blue, it is almost indistinguishable from black.',
 'text': '<|im_start|>user\nExtremely dark blue: This is such a dark shade of blue, it is almost indistinguishable from black. <|im_end|>\n<|im_start|>assistant\n#000010<|im_end|>\n'}

**Model** (Not the base model)

In [9]:
def get_model_and_tokenizer(model_id):

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(#load_in_8bit=True,
                                    bnb_4bit_quant_type="nf4",
                                    bnb_4bit_compute_dtype="float16",
                                    bnb_4bit_use_double_quant=True)

    model = AutoModelForCausalLM.from_pretrained(model_id,
                                                 load_in_4bit=True,
                                                 quantization_config=bnb_config,
                                                 device_map="auto")
    model.config.use_cache=False
    model.config.pretraining_tp=1
    model._is_quantized_training_enabled = True

    return model, tokenizer

In [10]:
model, tokenizer = get_model_and_tokenizer(model_id)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Setting **LoRA** configuration

In [11]:
peft_config = LoraConfig(r=8,
                         lora_alpha=16,
                         lora_dropout=0.05,
                         bias="none",
                         task_type="CAUSAL_LM")

In [12]:
training_arguments = TrainingArguments( output_dir=output_model,
                                        per_device_train_batch_size=16,
                                        gradient_accumulation_steps=4,
                                        optim="paged_adamw_32bit",
                                        learning_rate=2e-4,
                                        lr_scheduler_type="cosine",
                                        save_strategy="epoch",
                                        logging_steps=10,
                                        num_train_epochs=3,
                                        max_steps=250,
                                        fp16=True,
                                        # push_to_hub=True
                                    )

In [13]:
trainer = SFTTrainer( model=model,
                      train_dataset=data,
                      peft_config=peft_config,
                      dataset_text_field="text",
                      args=training_arguments,
                      tokenizer=tokenizer,
                      packing=False,
                      max_seq_length=1024
                  )

Map:   0%|          | 0/33887 [00:00<?, ? examples/s]

In [15]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.326
20,1.6394
30,1.2639
40,1.089
50,1.0106
60,0.981
70,0.9435
80,0.9426
90,0.9443
100,0.9202


TrainOutput(global_step=250, training_loss=1.019126491546631, metrics={'train_runtime': 628.3951, 'train_samples_per_second': 25.462, 'train_steps_per_second': 0.398, 'total_flos': 1.0256900235853824e+16, 'train_loss': 1.019126491546631, 'epoch': 0.47})

Merging the **LoRA** with the base model

In [16]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os

model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.float16,
                                             load_in_8bit=False,
                                             device_map="auto",
                                             trust_remote_code=True)

model_path = "/content/tinyllama-colorist-v1/checkpoint-250"


peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto")

model = peft_model.merge_and_unload()



In [17]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

Inference from the **Fine tune Llama ***

In [18]:
from transformers import GenerationConfig
from time import perf_counter

def generate_response(user_input):

  prompt = formatted_prompt(user_input)

  inputs = tokenizer([prompt], return_tensors="pt")

  generation_config = GenerationConfig(penalty_alpha=0.6,do_sample = True,
      top_k=5,temperature=0.5,repetition_penalty=1.2,
      max_new_tokens=12,pad_token_id=tokenizer.eos_token_id
  )

  start_time = perf_counter()

  inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

  outputs = model.generate(**inputs, generation_config=generation_config)
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))

  output_time = perf_counter() - start_time
  print(f"Time taken for inference: {round(output_time,2)} seconds")

In [19]:
def formatted_prompt(question)-> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"

In [20]:
def print_color_space(hex_color):
    def hex_to_rgb(hex_color):
        hex_color = hex_color.lstrip('#')
        return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
    r, g, b = hex_to_rgb(hex_color)
    print(f'{hex_color}: \033[48;2;{r};{g};{b}m           \033[0m')

In [21]:
generate_response(user_input='Light Orange color')

<|im_start|>user
Light Orange color<|im_end|>
<|im_start|>assistant: #ffd08a<|im_end|
Time taken for inference: 1.94 seconds


Validate the response

In [24]:
print_color_space('#ffd08a')

#ffd08a: [48;2;255;208;138m           [0m


In [25]:
generate_response(user_input='Drak Green color')

<|im_start|>user
Drak Green color<|im_end|>
<|im_start|>assistant: #108933<|im_end
Time taken for inference: 0.77 seconds


Validate the response

In [26]:
print_color_space('#108933')

#108933: [48;2;16;137;51m           [0m
