Install dependencies

In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# We have to check which Torch version for Xformers (2.3 -> 0.0.27)
from torch import __version__; from packaging.version import Version as V
xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
!pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [2]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install trl

  Installing build dependencies ... [?25l[?25hcanceled
[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 377, in run
    requirement_set = resolver.resolve(
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 76, in resolve
    collected = self.factory.collect_root_requirements(root_reqs)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/resolution/resolvelib/factory.py", line 538, in collect_root_requirements
    reqs = list(
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/resolution/resolvelib/factory

In [3]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from dataclasses import dataclass, field
from typing import Optional
import torch
from peft import LoraConfig
from tqdm import tqdm
import pandas as pd
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
import re
from transformers import TextStreamer
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


Load models

In [4]:
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [6]:
train_data = pd.read_csv('train.csv', sep='\t')
validation_data = pd.read_csv('validation.csv', sep='\t')
dataset_dict = DatasetDict({
    'train': train_data,
    'validation': validation_data
})
train_data = dataset_dict['train']
print(train_data)
instruction = "### Classify the sentiment of the following sentence as negative, neutral or positive."
train_data['instruction'] = instruction
print(train_data)
validation_data = dataset_dict['validation']
print(validation_data)
dataset = Dataset.from_pandas(train_data)
print(dataset)

                                                   text     label
0     The production is to be liquidated before June...  negative
1     The market making in accordance with the agree...   neutral
2     The largest construction company in Finland , ...   neutral
3     The transaction , which includes US$ 1.5 billi...   neutral
4     MADISON , Wis. , Feb. 6 - PRNewswire - -- Fisk...  positive
...                                                 ...       ...
3401  Based on the first quarter result , existing o...  negative
3402  Aldata said that there are still a number of o...   neutral
3403  The casing comprises a first side casing membe...   neutral
3404  The most significant capital expenditure items...   neutral
3405  Stockmann MasterCard has widened the scope of ...  positive

[3406 rows x 2 columns]
                                                   text     label  \
0     The production is to be liquidated before June...  negative   
1     The market making in accordance with th

<a name="Data"></a>
### Data Prep
We now use the dataset that trains Finbert.

In [7]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    # instructions = examples["instruction"]
    # inputs       = examples["input"]
    # outputs      = examples["output"]
    instructions = examples["instruction"]
    inputs       = examples["text"]
    outputs      = examples["label"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
# dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
print("dataset is",dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)


dataset is Dataset({
    features: ['text', 'label', 'instruction'],
    num_rows: 3406
})


Map:   0%|          | 0/3406 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train the model
Use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). Run 3 epochs

In [10]:
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/3406 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.984 GB of memory reserved.


In [11]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,406 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 425
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,0.9659
2,0.903
3,0.8698
4,0.9232
5,0.7426
6,0.9198
7,0.8534
8,0.9386
9,1.0545
10,0.8392


In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

456.3689 seconds used for training.
7.61 minutes used for training.
Peak reserved memory = 7.922 GB.
Peak reserved memory for training = 1.938 GB.
Peak reserved memory % of max memory = 53.716 %.
Peak reserved memory for training % of max memory = 13.141 %.


<a name="Inference"></a>
### Inference
Run the model

In [19]:
import re
import pandas as pd
validation_data = pd.read_csv('validation.csv', sep='\t')
print(validation_data)
answers = []
for i in range(len(validation_data['text'])):
# for i in range(5):
  FastLanguageModel.for_inference(model) # Enable native 2x faster inference
  inputs = tokenizer(
  [
      alpaca_prompt.format(
          "### Classify the sentiment of the following sentence as negative, neutral or positive.", # instruction
          validation_data['text'][i], # input
          "", # output - leave this blank for generation!
      )
  ], return_tensors = "pt").to("cuda")

  outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
  textOutput = tokenizer.batch_decode(outputs)
  textOutput = textOutput[0].lower()
  # print("textOutput is",textOutput)
  start_index = textOutput.find("### response:")
  if start_index != -1:
    result = textOutput[start_index:]
    # print(result)
    if("positive" in result):
      # print("positive")
      answers.append("positive")
    if("neutral" in result):
      # print("neutral")
      answers.append("neutral")
    if("negative" in result):
      # print("negative")
      answers.append("negative")
  else:
      answers.append("error")
      print("Response section not found.")

# print("answers are", answers)
print("length of answers is",len(answers))
print("length of validation['label'] is",len(validation_data['label']))
correctPre = 0
for i in range(min(len(answers),len(validation_data['label']))):
  if(answers[i] == validation_data['label'][i]):
    correctPre += 1
print(correctPre)
accuracy = correctPre / len(answers)
print("accuracy is",accuracy)

                                                  text     label
0    According to Scanfil , demand for telecommunic...  negative
1    Kemira 's partner in the project is St. Peters...   neutral
2    Finnish Rautaruukki has been awarded a contrac...  positive
3    It is also 7.7 pct above the 12-month volume w...  positive
4    Operating profit for the three-month period de...  negative
..                                                 ...       ...
373  Benefon , a wireless and GPS technology compan...  positive
374  In 2008 , Kemira recorded revenue of approxima...   neutral
375  EB announced in its stock exchange release on ...  negative
376  The group will expand the chain to include 45 ...  positive
377  The buyer is real estate owner Propertos Oy , ...   neutral

[378 rows x 2 columns]
length of answers is 378
length of validation['label'] is 378
322
accuracy is 0.8518518518518519


<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. Refer to below if you need save to 16bit or GGUF.

In [14]:
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
model.push_to_hub("cychiuak/llama3_4bit_fineTune_V4", token = "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") # Online saving
tokenizer.push_to_hub("cychiuak/llama3_4bit_fineTune_V4", token = "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") # Online saving

README.md:   0%|          | 0.00/589 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/cychiuak/llama3_4bit_fineTune_V4


### Saving to float16 for VLLM

Upload to hugging face

In [15]:
# Merge to 16bit
if True: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if True: model.push_to_hub_merged("cychiuak/llama3_16bit_fineTune_V4", tokenizer, save_method = "merged_16bit", token = "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")


Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.22 out of 12.67 RAM for saving.


 34%|███▍      | 11/32 [00:00<00:01, 13.41it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:41<00:00,  1.28s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving model/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: You are pushing to hub, but you passed your HF username = cychiuak.
We shall truncate cychiuak/llama3_16bit_fineTune_V4 to llama3_16bit_fineTune_V4


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.19 out of 12.67 RAM for saving.


100%|██████████| 32/32 [01:10<00:00,  2.22s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving llama3_16bit_fineTune_V4/pytorch_model-00001-of-00004.bin...
Unsloth: Saving llama3_16bit_fineTune_V4/pytorch_model-00002-of-00004.bin...
Unsloth: Saving llama3_16bit_fineTune_V4/pytorch_model-00003-of-00004.bin...
Unsloth: Saving llama3_16bit_fineTune_V4/pytorch_model-00004-of-00004.bin...


README.md:   0%|          | 0.00/589 [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/cychiuak/llama3_16bit_fineTune_V4


Save to GGUF/16bit

In [16]:

# Save to 16bit GGUF
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if True: model.push_to_hub_gguf("cychiuak/llama3_16bit_GGUF_fineTune_V4", tokenizer, quantization_method = "f16", token = "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")


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.46 out of 12.67 RAM for saving.


100%|██████████| 32/32 [01:58<00:00,  3.71s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving model/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at model into f16 GGUF format.
The output location will be ./model/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00004.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_q.weight,         torch.float16 --> F

100%|██████████| 32/32 [01:16<00:00,  2.40s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving cychiuak/llama3_16bit_GGUF_fineTune_V4/pytorch_model-00001-of-00004.bin...
Unsloth: Saving cychiuak/llama3_16bit_GGUF_fineTune_V4/pytorch_model-00002-of-00004.bin...
Unsloth: Saving cychiuak/llama3_16bit_GGUF_fineTune_V4/pytorch_model-00003-of-00004.bin...
Unsloth: Saving cychiuak/llama3_16bit_GGUF_fineTune_V4/pytorch_model-00004-of-00004.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at cychiuak/llama3_16bit_GGUF_fineTune_V4 into f16 GGUF format.
The output location will be ./c

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.F16.gguf:   0%|          | 0.00/16.1G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/cychiuak/llama3_16bit_GGUF_fineTune_V4
