[Source](https://colab.research.google.com/drive/1BiQiw31DT7-cDp1-0ySXvvhzqomTdI-o?usp=sharing&pli=1&authuser=5#scrollTo=_kbS7nRxcMt7)

In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.9/133.9 kB[0m [31m896.3 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.1 MB/s

In [9]:
from datasets import load_dataset
dataset_name = "OpenAssistant/oasst1"
dataset = load_dataset(dataset_name, split="train")


def prep_data(df):
    df_assistant = df[(df.role == "assistant") & (df["rank"] == 0.0)].copy()
    df_prompter = df[(df.role == "prompter")].copy()
    df_prompter = df_prompter.set_index("message_id")
    df_assistant["output"] = df_assistant["text"].values

    inputs = []
    parent_ids = []
    for _, row in df_assistant.iterrows():
        input = df_prompter.loc[row.parent_id]
        inputs.append(input.text)
        parent_ids.append(input.parent_id)

    df_assistant["instruction"] = inputs
    df_assistant["parent_id"] = parent_ids

    df_assistant = df_assistant[df_assistant.lang == "en"]

    df_assistant = df_assistant[
        ["instruction", "output", "message_id", "parent_id"]
    ].rename(columns={"message_id": "id"})

    return df_assistant
df_train = prep_data(dataset.to_pandas())

In [10]:
import datasets
from datasets import Dataset
dataset = Dataset.from_pandas(df_train)
train_dataset_mapped = dataset.map(lambda example: {'text': [f'[INST] <>\n' + prompt + ' [/INST] ' + response for prompt, response in zip(example['instruction'], example['output'])]}, batched=True)

Map:   0%|          | 0/7856 [00:00<?, ? examples/s]

In [11]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

model_name = "microsoft/phi-2"
# model_name = "ybelkada/falcon-7b-sharded-bf16"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
model

PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2560)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-31): 32 x ParallelBlock(
        (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
        (mixer): MHA(
          (rotary_emb): RotaryEmbedding()
          (Wqkv): Linear4bit(in_features=2560, out_features=7680, bias=True)
          (out_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (inner_attn): SelfAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (inner_cross_attn): CrossAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
        )
        (mlp): MLP(
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
          (act): NewGELUActivation()
        )
      )

In [14]:
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "Wqkv", #"query_key_value",
        "out_proj", #"dense",
        "fc1", #"dense_h_to_4h",
        "fc2", #"dense_4h_to_h",
    ]
)

In [15]:
from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 2
gradient_accumulation_steps = 8
optim = "paged_adamw_32bit"
save_steps = 50
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 500
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    #gradient_checkpointing=True,
)


In [18]:
from trl import SFTTrainer

max_seq_length = 256

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_mapped,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

Map:   0%|          | 0/7856 [00:00<?, ? examples/s]

In [19]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

In [20]:
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.6533
20,1.4485
30,1.5812
40,1.8871
50,1.9439
60,1.4546
70,1.4085
80,1.473
90,1.8777
100,1.9459


TrainOutput(global_step=500, training_loss=1.6256620540618896, metrics={'train_runtime': 3351.7142, 'train_samples_per_second': 2.387, 'train_steps_per_second': 0.149, 'total_flos': 2.4271959273984e+16, 'train_loss': 1.6256620540618896, 'epoch': 1.02})

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

In [None]:
# Run text generation pipeline with our next model
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])



<s>[INST] What is a large language model? [/INST]

A large language model is a type of artificial intelligence model that is trained on a large amount of text data. It is designed to be able to generate human-like text based on the input it receives. Large language models are used in a variety of applications, including chatbots, language translation, and text generation.

Large language models are trained using a technique called deep learning, which involves training a neural network on large amounts of text data. The neural network is able to learn patterns in the data and use this information to generate new text.

Large language models are able to generate text that is similar to human-generated text in many ways. They can generate text that is coherent, grammatically correct, and contextually appropriate. However, they are not perfect and can make mistakes, such as generating offensive or inappropriate text.

Large language models are constantly being improved and updated, and ne

In [21]:
!zip -r trained_model2.zip /content/results/checkpoint-500

  adding: content/results/checkpoint-500/ (stored 0%)
  adding: content/results/checkpoint-500/special_tokens_map.json (deflated 75%)
  adding: content/results/checkpoint-500/adapter_model.safetensors (deflated 7%)
  adding: content/results/checkpoint-500/vocab.json (deflated 59%)
  adding: content/results/checkpoint-500/adapter_config.json (deflated 49%)
  adding: content/results/checkpoint-500/added_tokens.json (deflated 84%)
  adding: content/results/checkpoint-500/scheduler.pt (deflated 57%)
  adding: content/results/checkpoint-500/trainer_state.json (deflated 86%)
  adding: content/results/checkpoint-500/optimizer.pt (deflated 9%)
  adding: content/results/checkpoint-500/README.md (deflated 66%)
  adding: content/results/checkpoint-500/tokenizer_config.json (deflated 94%)
  adding: content/results/checkpoint-500/merges.txt (deflated 53%)
  adding: content/results/checkpoint-500/rng_state.pth (deflated 25%)
  adding: content/results/checkpoint-500/tokenizer.json (deflated 72%)
  ad

In [22]:
!cp /content/trained_model2.zip /content/drive/MyDrive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!cp /content/drive/MyDrive/trained_model.zip .

In [5]:
!unzip /content/trained_model.zip -d .

Archive:  /content/trained_model.zip
   creating: ./content/results/checkpoint-500/
  inflating: ./content/results/checkpoint-500/tokenizer_config.json  
  inflating: ./content/results/checkpoint-500/scheduler.pt  
  inflating: ./content/results/checkpoint-500/merges.txt  
  inflating: ./content/results/checkpoint-500/adapter_config.json  
  inflating: ./content/results/checkpoint-500/special_tokens_map.json  
  inflating: ./content/results/checkpoint-500/tokenizer.json  
  inflating: ./content/results/checkpoint-500/README.md  
  inflating: ./content/results/checkpoint-500/trainer_state.json  
  inflating: ./content/results/checkpoint-500/added_tokens.json  
  inflating: ./content/results/checkpoint-500/rng_state.pth  
  inflating: ./content/results/checkpoint-500/training_args.bin  
  inflating: ./content/results/checkpoint-500/vocab.json  
  inflating: ./content/results/checkpoint-500/adapter_model.safetensors  
  inflating: ./content/results/checkpoint-500/optimizer.pt  
