# Llama 2 fine-tuning for smishing detection

Based on the tutorial ["Fine-Tuning LLaMA 2: A Step-by-Step Guide to Customizing the Large Language Model"](https://www.datacamp.com/tutorial/fine-tuning-llama-2).

In [None]:
%pip install accelerate peft bitsandbytes transformers trl wandb
import os, torch, wandb
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m307.2/309.4 kB[0m [31m10.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting trl
  Downloading trl-0.9.4-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [None]:
# Authenticate against the Hugging Face Hub and Weights & Biases.
# Both tokens are read from the Colab secret store, so no credential is written in the notebook.
from google.colab import userdata
from huggingface_hub import login

# Hugging Face Hub
hf_token = userdata.get('HuggingFace')
login(token=hf_token)

# Weights & Biases
wb_token = userdata.get('wandb')
wandb.login(key=wb_token)

# Open the W&B run that the training metrics are reported to.
run = wandb.init(
    anonymous="allow",
    job_type="training",
    project='Smishing detection with fine-tuned Llama 2 13B',
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdanielhenel[0m ([33mdanielhenel-research[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Hub checkpoint that fine-tuning starts from
base_model = "NousResearch/Llama-2-13b-chat-hf"

# Directory the fine-tuned model and tokenizer are saved to
new_model = "./models/smishing-detection-llama-2-13b-chat"

# Training data, read from a local file with the "text" dataset loader
dataset = load_dataset(
    "text",
    data_files="./data/llama2_train_data.txt",
    split="train",
)

Generating train split: 0 examples [00:00, ? examples/s]

Making fine-tuning more efficient by using 4-bit quantization

In [None]:
# dtype used for computation on top of the 4-bit weights
compute_dtype = torch.float16

# 4-bit NF4 quantization settings handed to the model loader; double quantization is off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
)

Loading the Llama 2 model and tokenizer

In [None]:
# Load the base checkpoint with the 4-bit quantization config, mapped onto device 0.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    quantization_config=quant_config,
)
# The generation cache is switched off for training; it is switched back on after training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# NOTE(review): trust_remote_code=True lets the Hub repository run its own Python code while
# loading; confirm this checkpoint actually needs it.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True,
)
# No dedicated padding token is configured here: EOS is reused, and padding goes on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/196 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

Parameter-Efficient Fine-Tuning

In [None]:
# LoRA adapter settings for causal language modelling; no bias terms are trained.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

Training parameters

In [None]:
# Hyper-parameters for the supervised fine-tuning run; metrics are reported to W&B.
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    # A checkpoint is written every 25 steps; keep only the most recent ones so that a run of
    # a few thousand steps does not fill the disk with dozens of checkpoints.
    save_total_limit=2,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # -1: the run length is governed by num_train_epochs
    # NOTE(review): with lr_scheduler_type="constant" the warm-up below is not applied (the
    # logged learning rate stays flat at 2e-4). Switch the scheduler to "constant_with_warmup"
    # if a warm-up phase is actually wanted; left unchanged here to keep the run reproducible.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="wandb"
)

Supervised fine-tuning

In [None]:
# Wire the quantized base model, LoRA config, tokenizer and dataset into the SFT trainer.
# NOTE(review): the installed trl release accepts dataset_text_field / max_seq_length / packing
# here but warns that they should move to SFTConfig - revisit when upgrading trl.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_params,
    peft_config=peft_params,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
)

# Run training; leaving the call as the last expression displays the returned TrainOutput.
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/4459 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step,Training Loss
25,2.6032
50,0.9391
75,1.5078
100,0.7163
125,1.4808
150,0.7371
175,1.503
200,0.7208
225,1.5351
250,0.7152




TrainOutput(global_step=2230, training_loss=0.9802287276015688, metrics={'train_runtime': 3165.8209, 'train_samples_per_second': 2.817, 'train_steps_per_second': 0.704, 'total_flos': 5.694061549403136e+16, 'train_loss': 0.9802287276015688, 'epoch': 2.0})

In [None]:
# Training is over: turn the generation cache back on (it was disabled before training) ...
model.config.use_cache = True
# ... and close the W&B run.
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.039 MB uploaded\r'), FloatProgress(value=0.05533462657613967, max=1.…

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▅▆▃▅▁▁▂▁▂▄▅▃▄▂▁▂▄▄▃▅▃▂▄▂▆█▆█▅▅▃▂▆▆▇▇▄▄▄▄
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,█▄▄▄▁▁▁▁▄▄▄▄▁▁▁▁▃▃▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂

0,1
total_flos,5.694061549403136e+16
train/epoch,2.0
train/global_step,2230.0
train/grad_norm,0.22748
train/learning_rate,0.0002
train/loss,0.7811
train_loss,0.98023
train_runtime,3165.8209
train_samples_per_second,2.817
train_steps_per_second,0.704


Save the model and the tokenizer

In [None]:
# Write the trained model and the tokenizer side by side into the output directory.
trainer.model.save_pretrained(save_directory=new_model)
trainer.tokenizer.save_pretrained(save_directory=new_model)



('/content/drive/MyDrive/Github/research/smishing detection/Llama/1/models/smishing-detection-llama-2-13b-chat/tokenizer_config.json',
 '/content/drive/MyDrive/Github/research/smishing detection/Llama/1/models/smishing-detection-llama-2-13b-chat/special_tokens_map.json',
 '/content/drive/MyDrive/Github/research/smishing detection/Llama/1/models/smishing-detection-llama-2-13b-chat/tokenizer.model',
 '/content/drive/MyDrive/Github/research/smishing detection/Llama/1/models/smishing-detection-llama-2-13b-chat/added_tokens.json',
 '/content/drive/MyDrive/Github/research/smishing detection/Llama/1/models/smishing-detection-llama-2-13b-chat/tokenizer.json')

In [None]:
# Upload the trained model to the Hugging Face Hub repository.
trainer.model.push_to_hub(
    repo_id="danielhenel/smishing-detection-llama-2-13b-chat",
)

adapter_model.safetensors:   0%|          | 0.00/210M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/danielhenel/smishing-detection-llama-2-13b-chat/commit/cf6a55ed51e8b5404a384940bca8bf977d8fdc3d', commit_message='Upload model', commit_description='', oid='cf6a55ed51e8b5404a384940bca8bf977d8fdc3d', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer to the same Hugging Face Hub repository as the model.
tokenizer.push_to_hub(
    repo_id="danielhenel/smishing-detection-llama-2-13b-chat",
)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/danielhenel/smishing-detection-llama-2-13b-chat/commit/bf29006af04dba97ed6a05934bb1f05212816b42', commit_message='Upload tokenizer', commit_description='', oid='bf29006af04dba97ed6a05934bb1f05212816b42', pr_url=None, pr_revision=None, pr_num=None)