In [1]:
%%time

print(">>> bitsandbytes")
!pip install -qqq bitsandbytes

print(">>> transformers")
!pip install -qqq transformers

print(">>> peft")
!pip install -qqq peft

print(">>> accelerate")
!pip install -qqq accelerate

print(">>> datasets")
!pip install -qqq datasets

print(">>> trl")
!pip install -qqq trl

print(">>> flash_attn")
!pip install -qqq flash_attn

print(">>> huggingface_hub")
!pip install -qqq huggingface_hub

print(">>> absl-py")
!pip install -qqq absl-py

print(">>> nltk")
!pip install -qqq nltk

print(">>> rouge_score")
!pip install -qqq rouge_score

>>> bitsandbytes
>>> transformers
>>> peft
>>> accelerate
>>> datasets
>>> trl
>>> flash_attn
>>> huggingface_hub
>>> absl-py
>>> nltk
>>> rouge_score
CPU times: user 1.21 s, sys: 191 ms, total: 1.4 s
Wall time: 2min 21s


In [2]:
import torch
import bitsandbytes
import peft
import accelerate
import datasets
import trl

# Record the installed version of each core library in the notebook output.
for library in (torch, bitsandbytes, peft, accelerate, datasets, trl):
    print(f"{library.__name__} version:", library.__version__)

torch version: 2.3.1
bitsandbytes version: 0.43.1
peft version: 0.11.1
accelerate version: 0.32.1
datasets version: 2.20.0
trl version: 0.9.4


In [3]:
# Show the GPU model, driver/CUDA versions and current memory usage of this machine.
!nvidia-smi

Sat Jul  6 22:27:02 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2060 ...    Off | 00000000:01:00.0 Off |                  N/A |
| 32%   30C    P8               1W / 175W |      8MiB /  8192MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
import os
from getpass import getpass
from random import randrange

import torch
import numpy as np
import pandas as pd
from huggingface_hub import login
from datasets import load_dataset, Dataset

from trl import SFTConfig, SFTTrainer
from peft import LoraConfig, prepare_model_for_kbit_training, TaskType, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    set_seed,
    pipeline
)

In [9]:
# Process-wide switches, set before any model or tokenizer work starts.
# NOTE(review): the two CUDA variables look like debugging aids; confirm they
# are still wanted for full training runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['TORCH_USE_CUDA_DSA'] = "1"

# SECURITY: the Hugging Face access token must never be stored in the notebook.
# A real token used to be hard-coded on this line; treat it as leaked and
# revoke it. The token is now taken from the HF_TOKEN environment variable,
# and only when that is missing or empty is it requested interactively
# (without echo). Either way HF_TOKEN is set afterwards, as before.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

# Directory used as the trainer's output location.
LOCAL_MODELPATH = "Phi3-FT-Lora"

login(token=os.environ["HF_TOKEN"], add_to_git_credential=True)

# Base checkpoint that gets quantized and fine-tuned.
model_name = "microsoft/Phi-3-mini-4k-instruct"

# Fix the random seed for reproducibility.
set_seed(1234)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Dataset

In [20]:
import re
# Regex building blocks for `split_into_sentences`.
# They are raw strings so that regex escapes such as `\s` reach the `re`
# module unchanged instead of being treated as (invalid) string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|edu|me)"
digits = r"([0-9])"
multiple_dots = r'\.{2,}'

def split_into_sentences(text: str) -> list[str]:
    """
    Split the text into sentences.

    Periods that do not end a sentence (titles, web domains, decimal points,
    initials, acronyms, company suffixes, ellipses) are first replaced by the
    marker "<prd>". Every remaining ".", "?" or "!" is then followed by the
    marker "<stop>", the text is split on "<stop>", and the "<prd>" markers
    are turned back into periods.

    If the text contains substrings "<prd>" or "<stop>", they would lead
    to incorrect splitting because they are used as markers for splitting.

    :param text: text to be split into sentences
    :type text: str

    :return: list of sentences, each stripped of surrounding whitespace;
        an empty list for empty input
    :rtype: list[str]
    """
    # Pad with spaces so the patterns that expect a neighbouring space also
    # match at the very beginning and end of the text.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # --- protect periods that are not sentence boundaries ------------------
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    # The digit after the period is only looked at, not consumed, so every
    # period in a run such as "1.2.3" is protected (consuming it would leave
    # the second period unprotected).
    text = re.sub(digits + "[.](?=" + digits + ")", r"\1<prd>", text)
    # An ellipsis keeps all of its dots and also ends the sentence.
    text = re.sub(multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym followed by a typical sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # Same idea for company suffixes: boundary only before a sentence starter.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # --- move terminators outside closing quotes ---------------------------
    # This keeps the closing quote attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # --- mark the real boundaries, restore protected periods, split --------
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = [piece.strip() for piece in text.split("<stop>")]
    # The padding after the final terminator leaves one empty trailing piece.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences

In [15]:
# Read the whole corpus into memory. The encoding is given explicitly so the
# decoded text does not depend on the platform's default encoding.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("data/anne.txt", "r", encoding="utf-8") as file:
    anne = file.read()

In [22]:
# Break the raw corpus text into a flat list of sentence strings.
sentences = split_into_sentences(anne)

In [32]:
import pandas as pd
# Each example pairs a sentence (the prompt) with the sentence that directly
# follows it in the text (the completion).
dataset = Dataset.from_pandas(
    pd.DataFrame(
        list(zip(sentences, sentences[1:])),
        columns=["prompt", "completion"],
    )
)

In [35]:
# Numeric precision and attention backend used when the model is loaded.
compute_dtype, attn_implementation = torch.float16, 'eager'

# Echo both settings so they are recorded in the notebook output.
print(attn_implementation, compute_dtype, sep="\n")

eager
torch.float16


# Model

In [36]:
# 4-bit NF4 quantization with double quantization.
# The compute dtype follows `compute_dtype` so it agrees with the
# `torch_dtype` the model is loaded with below. It used to be hard-coded to
# "bfloat16" while everything else in this notebook was set to float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load the base checkpoint with the quantization config applied.
# NOTE(review): `trust_remote_code=True` executes modelling code downloaded
# from the Hub; the recorded output suggests pinning a revision — consider
# passing `revision=` once a known-good commit has been chosen.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=compute_dtype,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation,
)

# Prepare the quantized model for adapter training. The recorded training log
# shows gradient checkpointing active afterwards.
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [39]:
# Tokenizer matching the base checkpoint.
# NOTE(review): `device_map` looks like a model-loading option — confirm the
# tokenizer actually needs it.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    add_eos_token=True,
    use_fast=True,
    device_map="auto",
)

# Pad on the right, reusing the unknown token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Preprocessing

In [40]:
def format_dataset_chatml(row):
    """
    Render one prompt/completion row as a single chat-formatted string.

    The row's "prompt" becomes the user turn and its "completion" the
    assistant turn; the conversation is rendered with the tokenizer's chat
    template, as text and without a trailing generation prompt.

    :param row: mapping with the keys "prompt" and "completion"
    :return: dict with the single key "text" holding the rendered conversation
    """
    turns = (("user", "prompt"), ("assistant", "completion"))
    messages = [
        {"content": f"{row[column]}", "role": role}
        for role, column in turns
    ]

    rendered = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=False,
        tokenize=False,
    )
    return {"text": rendered}



# Render every row to chat text, then hold out 5% of the rows for evaluation
# (fixed seed so the split is reproducible).
dataset_chatml = (
    dataset
    .map(format_dataset_chatml)
    .train_test_split(test_size=0.05, seed=1234)
)
dataset_chatml

Map:   0%|          | 0/7037 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion', 'text'],
        num_rows: 6685
    })
    test: Dataset({
        features: ['prompt', 'completion', 'text'],
        num_rows: 352
    })
})

# Trainer

In [42]:
%%time

sft_config = SFTConfig(
    dataset_text_field="text",
    max_seq_length=512,
    output_dir=LOCAL_MODELPATH,
    eval_strategy="steps",
    do_eval=True,
    optim="adamw_torch",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    log_level="debug",
    save_strategy="epoch",
    logging_steps=1,
    learning_rate=1e-2,
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    eval_steps=5,
    num_train_epochs=10,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    report_to="none",
    seed=42,
)

peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        task_type=TaskType.CAUSAL_LM,
        target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
)

trainer = SFTTrainer(
    model,
    train_dataset=dataset_chatml['train'],
    eval_dataset=dataset_chatml['test'],
    args=sft_config,
    peft_config=peft_config,
    tokenizer=tokenizer,
)

Map:   0%|          | 0/6685 [00:00<?, ? examples/s]

Map:   0%|          | 0/352 [00:00<?, ? examples/s]

Using auto half precision backend


CPU times: user 1.74 s, sys: 12 ms, total: 1.75 s
Wall time: 756 ms


#### Callbacks

In [43]:
%%time

# Run the fine-tuning loop configured on `trainer`.
# The recorded run below was interrupted manually (KeyboardInterrupt), so the
# save step after it did not execute in that run.
trainer.train()

# Persist the trained model; with no argument this presumably writes to the
# trainer's configured output directory (verify against the library docs).
trainer.save_model()

Currently training with a batch size of: 8
***** Running training *****
  Num examples = 6,685
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 2,090
  Number of trainable parameters = 8,912,896
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss,Validation Loss
5,4.0545,4.065436
10,3.3717,3.239837
15,2.902,2.771523
20,2.7392,2.627663
25,2.606,2.546114
30,2.7772,2.518483
35,2.5435,2.472442
40,2.4711,2.450773
45,2.5578,2.430195
50,2.4118,2.418597



***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evaluation *****
  Num examples = 352
  Batch size = 8

***** Running Evalu

KeyboardInterrupt: 