In [None]:
!pip install accelerate peft bitsandbytes transformers trl


In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("messages.csv", sep="|", encoding="utf-8")

In [3]:
def formatted_train(input, response)->str:
    return f"<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>\n"

In [4]:
# pair messages
formatted_inputs = []
for i in range(len(df) - 1):
    input_text = df.iloc[i]["Message"]
    response_text = df.iloc[i + 1]["Message"]
    formatted_inputs.append(formatted_train(input_text, response_text))

input_text_last = df.iloc[-1]["Message"]
response_text_last = ""
formatted_inputs.append(formatted_train(input_text_last, response_text_last))
df["Formatted Input"] = formatted_inputs

In [6]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

In [7]:
def get_model_and_tokenizer(mode_id="meta-llama/Llama-2-7b-chat-hf"):

    tokenizer = AutoTokenizer.from_pretrained(mode_id)
    tokenizer.pad_token = tokenizer.eos_token
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="float16", bnb_4bit_use_double_quant=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        mode_id, quantization_config=bnb_config, device_map="auto"
    )
    model.config.use_cache=False
    model.config.pretraining_tp=1
    # we need add those in order to finetune
    lora_config = LoraConfig( 
        target_modules=["q_proj", "k_proj"],
        init_lora_weights=False
    )

    model.add_adapter(lora_config, adapter_name="adapter_1")

    return model, tokenizer

In [None]:
#from huggingface_hub import notebook_login
#notebook_login()

In [None]:
model, tokenizer = get_model_and_tokenizer()

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_data, valid_data = train_test_split(df, test_size=0.2)

In [11]:
from datasets import Dataset

In [12]:
train_data = Dataset.from_pandas(train_data).remove_columns('__index_level_0__')
valid_data = Dataset.from_pandas(valid_data).remove_columns('__index_level_0__')
train_data, valid_data

(Dataset({
     features: ['Sender', 'Message', 'Formatted Input'],
     num_rows: 75832
 }),
 Dataset({
     features: ['Sender', 'Message', 'Formatted Input'],
     num_rows: 18959
 }))

In [13]:
peft_config = LoraConfig(
        r=8, lora_alpha=16, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )

In [14]:
output_model="tinyllama-Clone-v1"
training_arguments = TrainingArguments(
        output_dir=output_model,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        optim="paged_adamw_32bit",
        learning_rate=1.5e-4,
        lr_scheduler_type="cosine",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=3,
        fp16=True,
        max_steps=120,
        # push_to_hub=True
    )

In [15]:
trainer = SFTTrainer(
        model=model,
        train_dataset=train_data,
        eval_dataset=valid_data,
        dataset_text_field="Formatted Input",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=128
    )

Map:   0%|          | 0/75832 [00:00<?, ? examples/s]

Map:   0%|          | 0/18959 [00:00<?, ? examples/s]

In [16]:
trainer.train()

Step,Training Loss
10,3.4503
20,2.5322
30,2.1523
40,1.9985
50,1.989
60,1.8689
70,1.8348
80,1.7768
90,1.7611
100,1.7607




TrainOutput(global_step=120, training_loss=2.0549012184143067, metrics={'train_runtime': 614.0791, 'train_samples_per_second': 6.253, 'train_steps_per_second': 0.195, 'total_flos': 1.260973669023744e+16, 'train_loss': 2.0549012184143067, 'epoch': 0.05})

In [17]:
trainer.evaluate()

{'eval_loss': 1.768835425376892,
 'eval_runtime': 1527.2635,
 'eval_samples_per_second': 12.414,
 'eval_steps_per_second': 1.552,
 'epoch': 0.05}

In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoModelForCausalLM
import torch
import os
#model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_id="meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, load_in_8bit=False,
                                             device_map="auto",
                                             trust_remote_code=True)

model_path = "/content/tinyllama-Clone-v1/checkpoint-120"

peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto",
                                       offload_dir="/content/offloaded_params")

model = peft_model.merge_and_unload()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from transformers import GenerationConfig
from time import perf_counter

def generate_response(user_input, max_length=128*2):

  prompt = formatted_prompt(user_input)

  inputs = tokenizer([prompt], return_tensors="pt")
  generation_config = GenerationConfig(
      penalty_alpha=0.6,
      do_sample = True,
      top_k=5,
      temperature=0.3,
      repetition_penalty=1.2,
      max_new_tokens=12,
      max_length=max_length,
      pad_token_id=tokenizer.eos_token_id
  )
  start_time = perf_counter()

  inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

  outputs = model.generate(**inputs, generation_config=generation_config)
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  output_time = perf_counter() - start_time
  print(f"Time taken for inference: {round(output_time,2)} seconds")

In [4]:
def formatted_prompt(question)-> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [19]:
generate_response(user_input='What did You do?')

<|im_start|>user
What did You do?<|im_end|>
<|im_start|>assistant: I am so sorry	I am so angry with my mom
Time taken for inference: 0.75 seconds
