# Install

In [2]:
!pip install -q -U bitsandbytes
# !pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install transformers==4.31 #temporary fix required owing to breaking changes on Aug 9th 2023
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
# !pip install -q datasets

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
from datasets import load_dataset


In [23]:
# Required when training models/data that are gated on HuggingFace, and required for pushing models to HuggingFace
# Installs the Hub client, then shows the interactive login widget so that later
# downloads/uploads in this notebook (the calls passing `use_auth_token=True`) can authenticate.
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Load the model to use: Llama-2-7B-chat, quantized to 4-bit (NF4) with bitsandbytes.

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Base checkpoint to fine-tune.
# "Trelis/Llama-2-7b-chat-hf-sharded-bf16" is an alternative if you don't have
# access via Meta on HuggingFace.
model_id = "meta-llama/Llama-2-7b-chat-hf"
# model_id = "meta-llama/Llama-2-13b-chat-hf"

# 4-bit loading settings: NF4 quantization type, double quantization enabled,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# The device map {"": 0} assigns the whole model to device index 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



# Training Setup

Next, apply some preprocessing to the model to prepare it for training, using the `prepare_model_for_kbit_training` method from PEFT.

In [6]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then let PEFT prepare the quantized model for training.
# The helper returns the prepared model, which replaces `model`.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [7]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Sums ``numel()`` over everything yielded by ``model.named_parameters()``,
    and separately over the parameters whose ``requires_grad`` is true, then
    prints both totals together with the trainable percentage.

    Args:
        model: Any object whose ``named_parameters()`` yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Returns:
        None. The summary line is written to stdout.
    """
    # NOTE(review): totals are raw numel() values. For the 4-bit model in this
    # notebook the reported total is roughly half the nominal 7B, presumably
    # because quantized weights are stored packed -- confirm before quoting it.
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # A model with no parameters would otherwise raise ZeroDivisionError.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}"
    )

In [8]:
from peft import LoraConfig, get_peft_model

# LoRA settings: rank 8, alpha 32, dropout 0.05, bias="none", causal-LM task.
# The target modules are the four attention projections, named as in Llama
# models; other architectures use different names (e.g. "query_key_value").
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
    ],
)

# Wrap the prepared model with the adapters and report the trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 8388608 || all params: 3508801536 || trainable%: 0.23907331075678143


# Data Setup

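Load the fine-tuning dataset: a local JSON file of question/answer pairs (`/content/output.json`). Each record is formatted as `question : … Answer : …` and tokenized so the model can be fine-tuned on it.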

In [9]:
# NOTE(review): no executed code in this notebook imports jq or langchain -- presumably a
# leftover from an earlier JSONLoader-based loader; confirm before removing.
!pip install jq langchain



In [10]:
# (The stray `!` shell escape that was here ran no command and has been removed.)

In [12]:
from pathlib import Path
import json

# Local JSON file holding the question/answer records.
file_path=r'/content/output.json'


def tokenize_qa(elem):
    """Format one record as a single text and tokenize it.

    Args:
        elem: A dataset row providing string fields "question" and "answer".

    Returns:
        The tokenizer's output for the text
        ``"question : <question> Answer : <answer>"``.
    """
    return tokenizer("question : " + elem["question"] + " Answer : " + elem["answer"])


# Load the JSON file with the `datasets` JSON builder, then tokenize every record.
data = load_dataset("json", data_files=file_path)
data = data.map(tokenize_qa)

Map:   0%|          | 0/30414 [00:00<?, ? examples/s]

# Training

Run the cell below to start training. For the sake of the demo, we run it for only a few steps (`max_steps=10`), just to showcase how this integration works with existing tools in the HF ecosystem.

In [13]:
import transformers

# needed for Llama tokenizer: reuse the EOS token (</s>) as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Short demo run: 10 steps, batch size 1 with 4 gradient-accumulation steps,
# logging every step, checkpoints written under "outputs".
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=10,
    warmup_steps=2,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    logging_steps=1,
)

# mlm=False: the collator is configured without masked-language-model masking.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=collator,
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.9809
2,2.4181
3,2.0007
4,2.3442
5,2.4787
6,2.295
7,1.8479
8,2.1199
9,2.1025
10,1.9923


TrainOutput(global_step=10, training_loss=2.2580220341682433, metrics={'train_runtime': 61.1814, 'train_samples_per_second': 0.654, 'train_steps_per_second': 0.163, 'total_flos': 86780627238912.0, 'train_loss': 2.2580220341682433, 'epoch': 0.0})

# Inference

In [14]:
from transformers import TextStreamer
# Re-enable the cache that the training cell switched off (`use_cache = False`).
model.config.use_cache = True
# Switch to evaluation mode; as the cell's last expression, the returned model is displayed.
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
              )
              (k_proj): Linear4bit(
                (lora_dropout): Module

In [15]:
# Define a stream *without* function calling capabilities
def stream(
    user_prompt,
    system_prompt='You are a helpful assistant that provides accurate and concise responses',
    max_new_tokens=500,
    device="cuda:0",
):
    """Generate a reply to ``user_prompt`` and stream it to stdout.

    The prompt is wrapped in the ``[INST]`` / ``<<SYS>>`` template, tokenized
    with the notebook-level ``tokenizer``, moved to ``device`` and passed to the
    notebook-level ``model``'s ``generate`` with a ``TextStreamer`` attached.

    Args:
        user_prompt: The user's message; surrounding whitespace is stripped.
        system_prompt: System instruction placed inside the ``<<SYS>>`` block;
            surrounding whitespace is stripped.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``.
        device: Device the tokenized inputs are moved to. It should match the
            device the model lives on (the model is loaded on GPU 0 above).

    Returns:
        None. The generated text is printed by the streamer.
    """
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{user_prompt.strip()} {E_INST}\n\n"

    inputs = tokenizer([prompt], return_tensors="pt").to(device)

    streamer = TextStreamer(tokenizer)

    # Despite returning the usual output, the streamer will also print the generated text to stdout.
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=max_new_tokens)

In [16]:
# Sample query against the fine-tuned model; the reply is streamed to the cell output.
stream('what are the tips in managing my bipolar disease?')

<s> [INST] <<SYS>>
You are a helpful assistant that provides accurate and concise responses
<</SYS>>

what are the tips in managing my bipolar disease? [/INST]

Managing bipolar disorder requires a comprehensive approach that includes medication, therapy, and lifestyle changes. Here are some tips to help you manage your bipolar disease:

1. Stick to your medication regimen: Medications such as mood stabilizers, antipsychotics, and antidepressants can help manage symptoms of bipolar disorder. It's essential to take your medication as prescribed by your doctor, even if you're feeling well.
2. Attend therapy sessions: Cognitive-behavioral therapy (CBT) and interpersonal therapy (IPT) are two effective therapies for managing bipolar disorder. These therapies can help you identify and change negative thought patterns, improve coping skills, and develop healthy relationships.
3. Practice self-care: Engage in activities that help you relax and reduce stress, such as yoga, meditation, or deep 

# Push Model to Hub

In [19]:
# Recursively zip the local `panther` directory into the archive `v1_panther_lm`.
# NOTE(review): nothing in this notebook creates `panther/`; it presumably comes from an
# earlier session -- confirm, since this cell will not reproduce on a fresh runtime.
!zip -r v1_panther_lm panther

  adding: panther/ (stored 0%)
  adding: panther/v1_bro_v1/ (stored 0%)
  adding: panther/v1_bro_v1/Llama-2-7b-chat-hf-fine-tuned-adapters/ (stored 0%)
  adding: panther/v1_bro_v1/Llama-2-7b-chat-hf-fine-tuned-adapters/adapter_model.bin (deflated 7%)
  adding: panther/v1_bro_v1/Llama-2-7b-chat-hf-fine-tuned-adapters/README.md (deflated 65%)
  adding: panther/v1_bro_v1/Llama-2-7b-chat-hf-fine-tuned-adapters/adapter_config.json (deflated 50%)
  adding: panther/.ipynb_checkpoints/ (stored 0%)


In [21]:
# Extract the last portion of the base_model
base_model_name = model_id.split("/")[-1]

# HuggingFace user/organisation that owns the pushed repos -- adjust to your own.
hf_namespace = "pranav29"

# Define the save and push paths
adapter_model = f"{hf_namespace}/{base_model_name}-fine-tuned-adapters"  # repo id for the LoRA adapters
new_model = f"{hf_namespace}/{base_model_name}-fine-tuned"  # repo id for the merged model and tokenizer

In [24]:
# Save the model
# NOTE(review): the repo id string is passed as the save location here, so the adapter
# files presumably land in a local folder of that name -- confirm.
model.save_pretrained(adapter_model, push_to_hub=True, use_auth_token=True)

# Push the model to the hub
# (`model` is the PEFT-wrapped model at this point; it is uploaded to the `adapter_model` repo.)
model.push_to_hub(adapter_model, use_auth_token=True)

adapter_model.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pranav29/Llama-2-7b-chat-hf-fine-tuned-adapters/commit/f77143798b9925a5d659dd642a857397ffada190', commit_message='Upload model', commit_description='', oid='f77143798b9925a5d659dd642a857397ffada190', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# reload the base model (you might need a pro subscription for this because you may need a high RAM environment for the 13B model since this is loading the full original model, not quantized)
# `cache_dir` is not defined by any earlier cell in this notebook, so referencing it directly
# raised NameError. Reuse it when one has been defined; otherwise None selects the default cache.
cache_dir = globals().get("cache_dir")
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='cpu', trust_remote_code=True, torch_dtype=torch.float16, cache_dir=cache_dir)

In [None]:
from peft import PeftModel

# Load the PEFT model: attach the new adapters (from `adapter_model`) to the reloaded base model
model = PeftModel.from_pretrained(
    model,
    adapter_model,
)

In [None]:
# The merged result replaces `model`, so the later push uploads the merged weights.
model = model.merge_and_unload() # merge adapters with the base model.

In [None]:
# Upload the merged model to the `new_model` repo, with a maximum shard size of 5GB.
model.push_to_hub(new_model, use_auth_token=True, max_shard_size="5GB")

In [None]:
# Push the tokenizer
# Reloads the base model's tokenizer from `model_id` and uploads it to the `new_model` repo,
# the same repo the merged model is pushed to.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.push_to_hub(new_model, use_auth_token=True)