In [2]:
!pip install wandb==0.14.2
!pip install datasets==2.8.0
!pip install peft==0.4.0
# !pip install transformers==4.30.1
!pip install git+https://github.com/huggingface/transformers.git@ccb92be
!pip install accelerate==0.20.3
# !pip instball bitsandbytes==0.39.0
!pip install bitsandbytes==0.41.1

Collecting wandb==0.14.2
  Downloading wandb-0.14.2-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb==0.14.2)
  Downloading GitPython-3.1.35-py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.8/188.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb==0.14.2)
  Downloading sentry_sdk-1.30.0-py2.py3-none-any.whl (218 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.8/218.8 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb==0.14.2)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb==0.14.2)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting setproctitle (from wandb==0.14.2)
  Downloading setproctit

# Model Inference

In [3]:
import peft
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
import shutil

# Load in model
model = AutoModelForCausalLM.from_pretrained(
    "gmongaras/wizardLM-7B-HF-8bit",
    return_dict=True,
    torch_dtype=torch.float16,
    load_in_8bit=True,
    trust_remote_code=True,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("gmongaras/wizardLM-7B-HF-8bit")
model = model.eval()

Downloading (…)lve/main/config.json:   0%|          | 0.00/980 [00:00<?, ?B/s]

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.


Downloading pytorch_model.bin:   0%|          | 0.00/7.01G [00:00<?, ?B/s]



Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [6]:
# Prompt should be in this style due to how the data was created
prompt = "#### Human: What is the capital of Australia?#### Assistant:"
temperature = 0.95


device = "auto"
inputs = tokenizer(prompt, return_tensors="pt")
if device != "cpu":
    inputs = inputs.to('cuda')
# del inputs['token_type_ids']
output = model.generate(**inputs, do_sample=True, top_p=temperature, top_k=60, max_new_tokens=100)
output = tokenizer.decode(output[0], skip_special_tokens=True)


print(output.split("#### Assistant: ")[1])

The capital of Australia is Canberra.


# Model Finetuning

In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)



max_length = 128


# Model loading params
load_in_4bit = True

# LoRA Params
lora_alpha = 16             # How much to weigh LoRA params over pretrained params
lora_dropout = 0.1          # Dropout for LoRA weights to avoid overfitting
lora_r = 16                 # Bottleneck size between A and B matrix for LoRA params
lora_bias = "all"           # "all" or "none" for LoRA bias
model_type = "wizard"        # falcon or llama or wizard
lora_target_modules = [     # Which modules to apply LoRA to (names of the modules in state_dict)
    "query_key_value",
    "dense",
    "dense_h_to_4h",
    "dense_4h_to_h",
] if model_type == "falcon" else [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj"
]

# Trainer params
output_dir = "outputs"                              # Directory to save the model
optim_type = "adamw_8bit"                           # Optimizer type to train with
learning_rate = 0.0005                              # Model learning rate
weight_decay = 0.002                                # Model weight decay
per_device_train_batch_size = 3                     # Train batch size on each GPU
per_device_eval_batch_size = 1                      # Eval batch size on each GPU
gradient_accumulation_steps = 16                    # Number of steps before updating model
warmup_steps = 5                                    # Number of warmup steps for learning rate
save_steps = 25                                    # Number of steps before saving model
logging_steps = 25                                 # Number of steps before logging
num_steps = 100                                   # Number of steps to finetune for




# Stupid cache
torch.cuda.empty_cache()




# Load in the model as a 4-bit or 8-bit model
if load_in_4bit == True:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="bfloat16",
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        # "tiiuae/falcon-7b" if model_type == "falcon" else "meta-llama/Llama-2-7b-hf",
        # "daryl149/llama-2-7b-hf",
        # "WizardLM/WizardMath-7B-V1.0",
        # "vivekraina/Llama-2-7b-hf-8bit",
        "gmongaras/wizardLM-7B-HF-8bit",
        trust_remote_code=True,
        device_map="auto",
        quantization_config=bnb_config,
        cache_dir="model/",
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        # "tiiuae/falcon-7b" if model_type == "falcon" else "meta-llama/Llama-2-7b-hf",
        # "daryl149/llama-2-7b-hf",
        # "WizardLM/WizardMath-7B-V1.0",
        # "vivekraina/Llama-2-7b-hf-8bit",
        "gmongaras/wizardLM-7B-HF-8bit",
        trust_remote_code=True,
        device_map="auto",
        load_in_8bit=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        cache_dir="model/",
    )



# Load in the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    # "tiiuae/falcon-7b" if model_type == "falcon" else "meta-llama/Llama-2-7b-hf",
    # "daryl149/llama-2-7b-hf",
    # "WizardLM/WizardMath-7B-V1.0",
    # "vivekraina/Llama-2-7b-hf-8bit",
    "gmongaras/wizardLM-7B-HF-8bit",
    trust_remote_code=True,
    cache_dir="model/",
)
tokenizer.pad_token = tokenizer.eos_token



# Load in the dataset and map using the tokenizer
dataset = load_dataset("squad")
"""
The dataset has context, questions, and answers.

For this example, I am just encoding the question and first answer.
when you would actually want the context and question.

We want the text string to be in the format
#### Human: {question}#### Assistant: {output}

We want to turn this into the format:
{
    "input_ids": input ids for the encoded instruction and input
    "labels": This is the input ids, but we put -100 where we want to mask the
                loss. We want to mask the loss for the instruction, input, and padding.
                We use -100 because PyTorch CrossEntropy ignores -100 labels.
    "attention_mask": attention mask so the model doesn't attend to padding
}
"""
def map_function(example):
    # Get the question and model output
    question = f"#### Human: {example['question'].strip()}"
    output = f"#### Assistant: {example['answers']['text'][0].strip()}"

    # Encode the question and output
    question_encoded = tokenizer(question)
    output_encoded = tokenizer(output, max_length=max_length-len(question_encoded["input_ids"]), truncation=True, padding="max_length")

    # Combine the input ids
    input_ids = question_encoded["input_ids"] + output_encoded["input_ids"]

    # The labels are the input ids, but we want to mask the loss for the context and padding
    labels = [-100]*len(question_encoded["input_ids"]) + [output_encoded["input_ids"][i] if output_encoded["attention_mask"][i] == 1 else -100 for i in range(len(output_encoded["attention_mask"]))]

    # Combine the attention masks. Attention masks are 0
    # where we want to mask and 1 where we want to attend.
    # We want to attend to both context and generated output
    attention_mask = [1]*len(question_encoded["input_ids"]) + output_encoded["attention_mask"]

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask
    }
data_train = dataset["train"].map(map_function)
data_test = dataset["validation"].map(map_function)

# NEW



# OLD
# Adapt the model with LoRA weights
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias=lora_bias,
    task_type="CAUSAL_LM",
    inference_mode=False,
    target_modules=lora_target_modules
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


training_args = TrainingArguments(
    output_dir=output_dir,
    optim=optim_type,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=per_device_train_batch_size,
    # per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    do_train=True,
    warmup_steps=warmup_steps,
    save_steps=save_steps,
    logging_steps=logging_steps,
    # evaluation_strategy="epoch",
    evaluation_strategy="no", # Not enough memory to eval on colab
    do_eval=False,
    max_steps=num_steps,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_train,
    # eval_dataset=data_test,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/980 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/7.01G [00:00<?, ?B/s]



Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/87599 [00:00<?, ?ex/s]

  0%|          | 0/10570 [00:00<?, ?ex/s]

trainable params: 39,976,960 || all params: 6,778,400,768 || trainable%: 0.5897697903718874


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.2818
50,0.9572
75,0.9515
100,0.9362


TrainOutput(global_step=100, training_loss=1.2816693878173828, metrics={'train_runtime': 2598.7767, 'train_samples_per_second': 1.847, 'train_steps_per_second': 0.038, 'total_flos': 2.45046976708608e+16, 'train_loss': 1.2816693878173828, 'epoch': 0.05})

In [None]:
try:
  del data_train, data_test, tokenizer, dataset
  del trainer, model
except NameError:
  pass
torch.cuda.empty_cache()
import gc
gc.collect()

0

In [None]:
import os
os.kill(os.getpid(), 9)

In [None]:
import peft
import torch
from peft import PeftConfig, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
import shutil
from bitsandbytes.nn import Int8Params, Linear8bitLt
from copy import deepcopy


import os
best = max([int(i.split("-")[-1]) for i in os.listdir("outputs") if i.startswith("checkpoint")])
lora_path = f"outputs/checkpoint-{best}" # Path to the LoRA weights
output_path = "outputs/merged_model" # Path to output the merged weights




peft_model_id = lora_path
peft_config = PeftConfig.from_pretrained(peft_model_id)
pth = os.listdir("model/models--gmongaras--wizardLM-7B-HF-8bit/snapshots/")[0]
peft_config.base_model_name_or_path = f"model/models--gmongaras--wizardLM-7B-HF-8bit/snapshots/{pth}"
model = AutoModelForCausalLM.from_pretrained(
    peft_config.base_model_name_or_path,
    return_dict=True,
    torch_dtype=torch.float16,
    load_in_8bit=True,
    trust_remote_code=True,
    device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

shutil.copytree(peft_config.base_model_name_or_path, output_path, dirs_exist_ok=True, ignore=shutil.ignore_patterns('*.pt', "*.pth", "*.bin"))

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)
model.eval()

key_list = [key for key, _ in model.base_model.model.named_modules() if "lora" not in key]
for key in key_list:
    try:
        sub_mod = model.base_model.model.get_submodule(key)
        parent = model.base_model.model.get_submodule(".".join(key.split(".")[:-1]))
    except AttributeError:
        continue
    target_name = key.split(".")[-1]
    if isinstance(sub_mod, peft.tuners.lora.Linear) or isinstance(sub_mod, peft.tuners.lora.Linear8bitLt):
        # sub_mod.merge()

        if isinstance(sub_mod, peft.tuners.lora.Linear8bitLt):
            layer = deepcopy(model.base_model.model.get_submodule(key))
            layer.weight
            layer.merged = False
            layer.fan_in_fan_out = False



            def get_delta_weight(layer, adapter) -> torch.Tensor:
                from peft.utils.other import transpose
                A_weight = layer.lora_A[adapter].weight
                A_weight = Int8Params(data=A_weight, requires_grad=False).to(layer.weight.device)
                B_weight = layer.lora_B[adapter].weight
                B_weight = Int8Params(data=B_weight, requires_grad=False).to(layer.weight.device)
                # return (
                #     transpose(
                #         layer.lora_B[adapter].weight @ layer.lora_A[adapter].weight,
                #         layer.fan_in_fan_out,
                #     )
                #     * layer.scaling[adapter]
                # )
                return (
                    transpose(
                        B_weight @ A_weight,
                        layer.fan_in_fan_out,
                    )
                    * layer.scaling[adapter]
                )

            def merge(layer) -> None:
                import warnings
                if layer.active_adapter not in layer.lora_A.keys():
                    return
                if layer.merged:
                    warnings.warn("Already merged. Nothing to do.")
                    return
                if layer.r[layer.active_adapter] > 0:
                    new_weight = get_delta_weight(layer, layer.active_adapter)
                    new_weight = Int8Params(new_weight)
                    layer.weight.data = layer.weight.data + new_weight
                    layer.merged = True

            merge(layer)

            bias = sub_mod.bias is not None
            # new_module = torch.nn.Linear(sub_mod.in_features, sub_mod.out_features, bias=bias)
            new_module = Linear8bitLt(layer.weight.shape[0], layer.weight.shape[1]).to(layer.weight.device)
            new_module.weight = layer.weight
            # new_module.weight.data = sub_mod.weight.float()
            if bias:
                new_module.bias.data = sub_mod.bias
            model.base_model._replace_module(parent, target_name, new_module, sub_mod)
        else:
            sub_mod.merge()

            bias = sub_mod.bias is not None
            new_module = torch.nn.Linear(sub_mod.in_features, sub_mod.out_features, bias=bias)
            new_module.weight.data = sub_mod.weight.float()
            if bias:
                new_module.bias.data = sub_mod.bias
            model.base_model._replace_module(parent, target_name, new_module, sub_mod)

model = model.base_model.model

# Save the model
model.save_pretrained(output_path, save_in_8bit=True)

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` attribute will be overwritten with the one you passed to `from_pretrained`.


In [None]:
try:
  del model, tokenizer
except NameError:
  pass
torch.cuda.empty_cache()
import gc
gc.collect()

5863

In [None]:
import os
os.kill(os.getpid(), 9)

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
)






device = "auto"
model_path = "outputs/merged_model"             # Path to the combined weights





bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="bfloat16",
        bnb_4bit_use_double_quant=True,
    )
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map=device,
    # load_in_8bit=True,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Some weights of the model checkpoint at outputs/merged_model were not used when initializing LlamaForCausalLM: ['model.layers.1.mlp.down_proj.bias', 'model.layers.9.mlp.gate_proj.bias', 'model.layers.23.mlp.gate_proj.bias', 'model.layers.24.self_attn.q_proj.bias', 'model.layers.5.self_attn.q_proj.bias', 'model.layers.24.self_attn.v_proj.bias', 'model.layers.23.self_attn.k_proj.bias', 'model.layers.25.self_attn.k_proj.bias', 'model.layers.28.mlp.down_proj.bias', 'model.layers.27.self_attn.o_proj.bias', 'model.layers.26.mlp.down_proj.bias', 'model.layers.16.mlp.up_proj.bias', 'model.layers.18.mlp.up_proj.bias', 'model.layers.22.self_attn.k_proj.bias', 'model.layers.4.self_attn.o_proj.bias', 'model.layers.7.self_attn.o_proj.bias', 'model.layers.29.self_attn.k_proj.bias', 'model.layers.25.self_attn.q_proj.bias', 'model.layers.29.mlp.up_proj.bias', 'model.layers.20.self_attn.k_proj.bias', 'model.layers.6.mlp.up_proj.bias', 'model.layers.30.mlp.down_proj.bias', 'model.layers.30.self_attn.o_p

In [None]:
#### No merging
# import peft
# import torch
# from peft import PeftConfig, PeftModel
# from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
# import shutil


# import os
# best = max([int(i.split("-")[-1]) for i in os.listdir("outputs") if i.startswith("checkpoint")])
# lora_path = f"outputs/checkpoint-{best}" # Path to the LoRA weights
# output_path = "outputs/merged_model" # Path to output the merged weights




# peft_model_id = lora_path
# peft_config = PeftConfig.from_pretrained(peft_model_id)
# pth = os.listdir("model/models--gmongaras--wizardLM-7B-HF-8bit/snapshots/")[0]
# peft_config.base_model_name_or_path = f"model/models--gmongaras--wizardLM-7B-HF-8bit/snapshots/{pth}"
# model = AutoModelForCausalLM.from_pretrained(
#     peft_config.base_model_name_or_path,
#     return_dict=True,
#     torch_dtype=torch.float16,
#     load_in_8bit=True,
#     trust_remote_code=True,
#     device_map="auto",
# )
# tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)
# model = model.eval()

In [None]:
# Prompt should be in this style due to how the data was created
prompt = "#### Human: What is the capital of Australia?#### Assistant:"

inputs = tokenizer(prompt, return_tensors="pt")
if device != "cpu":
    inputs = inputs.to('cuda')
# del inputs['token_type_ids']
output = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=60, max_new_tokens=100)
output = tokenizer.decode(output[0], skip_special_tokens=True)


print(output.split("#### Assistant: ")[1])

The capital of Australia is Canberra.


In [None]:
# Save model to drive
pth = '/content/gdrive/My Drive/Llama2_finetuned'
from google.colab import drive
drive.mount('/content/gdrive')
%cd ~/
import os.path
from os import path
if path.exists(pth) == False:
  os.makedirs(pth)
import shutil
shutil.copytree("/content/outputs/", pth, dirs_exist_ok=True)#, ignore=shutil.ignore_patterns('*.pt', "*.pth", "*.bin"))

Mounted at /content/gdrive
/root


'/content/gdrive/My Drive/Llama2_finetuned'