In [8]:
import os
# os.environ['CUDA-VISIBLE_DEVICE'] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

from peft import LoraConfig, get_peft_model
import transformers
from datasets import load_dataset

In [11]:
model_name_or_path = "bigscience/bloom-1b7"
tokenizer_name_or_path = "bigscience/tokenizer"

quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)

device_map = {
    "transformer.word_embeddings": 0,
    "transformer.word_embeddings_layernorm": 0,
    "lm_head": "cpu",
    "transformer.h": 0,
    "transformer.ln_f": 0,
}

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    # load_in_8bit=True,
    quantization_config=quantization_config,
    # device_map=device_map,
    device_map='auto'
)

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

ValueError: Need either a `state_dict` or a `save_folder` containing offloaded weights.

In [None]:
for param in model.parameters():
    # freeze the model, train adapters later
    param.requires_grad = False
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

# reduce number of stored activations
model.gradient_checkpoint_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)
    model.lm_head = CastOutputToFloat(model.lm_head)

In [4]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all parmas: {all_param} || trainable%: {100*trainable_params/all_param}")

In [None]:
config = LoraConfig(r=16, #attention heads
                    lora_alpha=32, #alpha scaling
                    lora_dropout=0.05,
                    bias='none',
                    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
                   )
                    
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [6]:
data = load_dataset("Abirate/english_quotes")

Downloading readme:   0%|          | 0.00/5.55k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/647k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [21]:
data['train'][-1]

{'quote': '“Silence is so freaking loud”',
 'author': 'Sarah Dessen,',
 'tags': ['just-listen', 'loud', 'owen', 'sara-dessen', 'silence']}

In [22]:
def merge_columns(example):
    example["prediction"] = example["quote"] + " ->: " + str(example["tags"])
    return example

data['train'] = data['train'].map(merge_columns)
data['train']["prediction"][:5]

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

ImportError: DLL load failed while importing _imaging: The specified module could not be found.

In [None]:
data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)

In [None]:
trainer = transformer.Trainer(model=model,
                              train_dataset=data['train'],
                              args=transformers.TrainingArguments(
                                  per_device_train_batch_size=4,
                                  gradient_accumulation_steps=4,
                                  warmup_steps=100,
                                  mex_steps=200,
                                  learning_rate=2e-4,
                                  fp16=True,
                                  loggin_steps=1,
                                  output_dir='output'
                             ),
                             data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False))

# silence the warnings. Please re-enable for inference
model.config.use_cache = False
trainer.train()

In [None]:
model.push_to_hub("conzchung/flan-t5-xxl",
                  use_auth_token=True,
                  commit_message="basic training",
                  private=True)

In [None]:
peft_model_id = 'conzchung/flan-t5-xxl'
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, 
                                             return_dict=True, 
                                             load_in_8bit=True, 
                                             device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
# Inference
batch = tokenizer("“Training models with PEFT and LoRa is cool” ->: ", return_tensors='pt')

# with torch.cuda.amp.autocast():
with torch.autocast('cuda', dtype=torch.bfloat16):
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))