In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
import evaluate
import numpy as np
from peft import LoraConfig, TaskType, get_peft_model
import torch

In [2]:
# Base checkpoint used for both the tokenizer and the classifier below.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Register a dedicated "<pad>" token and pad on the left.
# (model_max_length is left at the tokenizer's default.)
tokenizer.add_special_tokens(dict(pad_token="<pad>"))
tokenizer.padding_side = "left"

# Kept in a variable because the model config needs the same id later on.
pad_token_id = tokenizer.pad_token_id
print("pad_token_id: " + str(pad_token_id))

pad_token_id: 128256


In [3]:
# Load the JSON training file; it arrives as a DatasetDict with a single "train" split.
raw_dataset = load_dataset("json", data_files="ft_training_set/digit_train.json")
print(raw_dataset)
print(raw_dataset["train"][0])

# Keep only what the classifier needs: the prompt ("instruction" -> "text")
# and the 0/1 target ("answer" -> "label"); "input" and "output" are dropped.
raw_dataset = (
    raw_dataset
    .rename_column("answer", "label")
    .rename_column("instruction", "text")
    .remove_columns(["input", "output"])
)
print(raw_dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'answer'],
        num_rows: 11584
    })
})
{'instruction': "Classify the following tweet for crisis management. Decide if it gives important information that could help during a crisis. Reply with '1' if the tweet provides useful information, or '0' if it does not. Tweet: RT @Cal_OES: PLS SHARE: Weâ€™re capturing wildfire response, recovery info here: https://t.co/r89LKpjLPj https://t.co/HiA1oQF2Ax.", 'input': '', 'output': 1, 'answer': 1}
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 11584
    })
})


In [4]:
# def generate_prompt(example):
#     return f"""{example["text"]}
# ### Answer:{example["answer"]}""" 

In [5]:
def tokenize_function(example):
    """Tokenize the ``text`` field of a dataset example.

    Args:
        example: A mapping with a ``"text"`` entry (as produced by the
            column renaming done on ``raw_dataset``).

    Returns:
        The tokenizer output for that text, with truncation enabled and no
        padding applied at this stage.
    """
    prompt = example["text"]
    encoded = tokenizer(prompt, truncation=True)
    return encoded

In [6]:
# Tokenize in batches: the tokenizer accepts a list of strings, so
# `tokenize_function` works unchanged while `map` avoids one Python-level
# tokenizer call per row. The resulting columns are the same as the
# per-example version (input_ids and attention_mask, unpadded).
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/11584 [00:00<?, ? examples/s]

In [18]:
# Sanity check: show the train split summary followed by its first tokenized row.
print(tokenized_dataset["train"], tokenized_dataset["train"][0], sep="\n")

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 11584
})
{'text': "Classify the following tweet for crisis management. Decide if it gives important information that could help during a crisis. Reply with '1' if the tweet provides useful information, or '0' if it does not. Tweet: RT @Cal_OES: PLS SHARE: Weâ€™re capturing wildfire response, recovery info here: https://t.co/r89LKpjLPj https://t.co/HiA1oQF2Ax.", 'label': 1, 'input_ids': [128000, 1999, 1463, 279, 2768, 12072, 369, 11501, 6373, 13, 99981, 422, 433, 6835, 3062, 2038, 430, 1436, 1520, 2391, 264, 11501, 13, 18321, 449, 364, 16, 6, 422, 279, 12072, 5825, 5505, 2038, 11, 477, 364, 15, 6, 422, 433, 1587, 539, 13, 26213, 25, 10860, 571, 9027, 2281, 1600, 25, 393, 7416, 54770, 25, 1226, 9011, 48092, 265, 40880, 93225, 2077, 11, 13654, 3630, 1618, 25, 3788, 1129, 83, 6973, 7534, 4578, 94063, 92216, 12852, 73, 3788, 1129, 83, 6973, 14, 13347, 32, 16, 78, 48, 37, 17, 38942, 13], 'attention_mask':

In [8]:
# Pad each batch at collation time (examples are stored unpadded), rounding
# the padded length up to a multiple of 8.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,
)

In [9]:
# Accuracy metric object used by compute_metrics.
accuracy = evaluate.load(path="accuracy")

In [10]:
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: A two-item iterable; the first item holds per-class scores
            and the second the reference labels.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class of each
        prediction compared against the references.
    """
    scores, references = eval_pred
    # Collapse the last axis (per-class scores) into a predicted class index.
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy.compute(references=references, predictions=predicted_classes)

In [11]:
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(
#     load_in_8bit=True)

In [12]:
# Class-index <-> human-readable label mappings handed to the model config.
# label2id is derived from id2label so the two cannot drift apart.
id2label = dict(enumerate(("NOT-INFORMATIVE", "INFORMATIVE")))
label2id = {name: index for index, name in id2label.items()}

In [19]:
# Load the base checkpoint with a 2-way classification head; the head
# ('score.weight') is newly initialized, as the load warning reports.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Tell the model which id is padding, using the id of the token added to the tokenizer.
model.config.pad_token_id = pad_token_id

# Grow the embedding matrix to cover the extra "<pad>" token.
# Left as the last expression so the resized Embedding is displayed.
model.resize_token_embeddings(len(tokenizer))

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(128257, 2048)

In [20]:
# LoRA adapter settings for sequence classification: rank-4 adapters on the
# attention q/k/v projections and the MLP up/down projections.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj"],
    r=4,
    lora_alpha=64,
    lora_dropout=0.1,
)

# Wrap the classifier with the adapters and report how many parameters remain trainable.
lora_model = get_peft_model(model=model, peft_config=lora_config)
lora_model.print_trainable_parameters()

trainable params: 1,904,640 || all params: 1,237,725,184 || trainable%: 0.1539


In [21]:
# Training configuration: checkpoints go to "my_awesome_model2", batches of
# 8 examples per device, fp16 mixed precision. Everything else is left at
# the TrainingArguments defaults.
training_args = TrainingArguments(
    fp16=True,
    per_device_train_batch_size=8,
    output_dir="my_awesome_model2",
)

In [22]:
# Hand the PEFT-wrapped model (not the bare base model) to the Trainer.
# The LoRA layers are injected into `model` in place, so training behaves
# the same either way, but passing the wrapper lets the Trainer treat it as
# a PEFT model: checkpoints then contain the adapter rather than a full
# copy of the base weights with LoRA-injected parameter names.
# NOTE(review): no eval_dataset is passed, so compute_metrics is presumably
# never invoked during this run — add an evaluation split to get accuracy.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [23]:
# Run fine-tuning with the `trainer` built in the previous cell. The call is
# the cell's last expression, so its return value (a TrainOutput) is displayed.
trainer.train()

Step,Training Loss
500,0.5068
1000,0.4407
1500,0.3873
2000,0.3544
2500,0.3612
3000,0.3366
3500,0.2956
4000,0.2858


TrainOutput(global_step=4344, training_loss=0.365123123534257, metrics={'train_runtime': 3261.5619, 'train_samples_per_second': 10.655, 'train_steps_per_second': 1.332, 'total_flos': 2.0197395327614976e+16, 'train_loss': 0.365123123534257, 'epoch': 3.0})

In [24]:
# Persist the tokenizer as well: a "<pad>" token was added to it, and
# inference must use the same vocabulary / pad id as training.
tokenizer.save_pretrained("my_awesome_model2");

# Save through the PEFT wrapper so the output is a reloadable LoRA adapter
# (adapter config + adapter weights). Calling save_pretrained on the bare
# `model` would instead dump the base model with LoRA layers injected into it.
lora_model.save_pretrained("my_awesome_model2")