In [None]:
%%capture
%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U wandb

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format,SFTConfig

In [None]:
from huggingface_hub import login
from google.colab import userdata

hf_token = userdata.get("hugging_face")

login(token = hf_token)

wb_token = userdata.get("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='WordO - The word generator',
    job_type="training",
    anonymous="allow"
)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdijodaijuc[0m ([33mdevokoder[0m). Use [1m`wandb login --relogin`[0m to force relogin


##Importing base model and dataset

In [None]:
base_model = "meta-llama/Meta-Llama-3-8B"
dataset_name = "dijodaiju/wordo"
new_model = "llama-3-8b-hf-wordo-the-wordgen"


##Setting data type and attention mechanism as "eager"

In [None]:
torch_dtype = torch.float16
attn_implementation = "eager"

## Initializing quantization using bitsandbytes in 4-bit precesion

In [1]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

NameError: name 'BitsAndBytesConfig' is not defined

## Loading tokenizer of Llama3 and initializing chat format

In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
model, tokenizer = setup_chat_format(model, tokenizer)

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
model = get_peft_model(model, peft_config)

In [None]:
#Importing the dataset
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=65).select(range(1000)) # Only use 1000 samples for quick demo

def format_chat_template(row):
    row_json = [{"role": "user", "content": row["Meaning"]},
               {"role": "assistant", "content": row["Word"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

dataset['text'][3]

Downloading readme:   0%|          | 0.00/155 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/750k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13145 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

'<|im_start|>user\nlikely to do something<|im_end|>\n<|im_start|>assistant\nApt<|im_end|>\n'

In [None]:
dataset=dataset.train_test_split(test_size=0.1)
dataset['validation']=dataset['test']

In [None]:
training_arguments = SFTConfig(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)



In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
trainer.train()



Step,Training Loss,Validation Loss
90,1.2426,1.636251
180,1.5277,1.622633
270,1.3601,1.585158
360,1.7167,1.571052
450,1.5005,1.550521


TrainOutput(global_step=450, training_loss=1.703377644220988, metrics={'train_runtime': 642.5674, 'train_samples_per_second': 1.401, 'train_steps_per_second': 0.7, 'total_flos': 908341983805440.0, 'train_loss': 1.703377644220988, 'epoch': 1.0})

In [None]:
wandb.finish()
model.config.use_cache = True

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▇▄▃▁
eval/runtime,▁▅▇██
eval/samples_per_second,█▄▂▁▁
eval/steps_per_second,█▄▂▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▅▆▃▂█▂▃▂▃▂▂▃▂▃▂▂▂▃▂▂▂▁▂▂▂▂▂▂▂▂▁▂▂▂▂▁▂▁▁▂
train/learning_rate,▄███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▃▂▂▃▂▂▂▂▃▂▂▂▃▂▂▂▃▂▃▂▂▄▄▂▂▂▂▄▄▁▂▂▃▂▁▂▂▁▂

0,1
eval/loss,1.55052
eval/runtime,25.0306
eval/samples_per_second,3.995
eval/steps_per_second,3.995
total_flos,908341983805440.0
train/epoch,1.0
train/global_step,450.0
train/grad_norm,2.93002
train/learning_rate,0.0
train/loss,1.5005


In [None]:
messages = [
    {
        "role": "user",
        "content": "it was very tiring and difficult"
    }
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True).to("cuda")

outputs = model.generate(**inputs, max_length=50,
                         num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Exhausting 2
Arduous 3
Laborious 4
Harrowing 5
Draining 6
Fatiguing 7

