In [19]:
import os
import logging
import getpass
from dotenv import load_dotenv
from datasets import load_dataset
from unsloth import FastLanguageModel, is_bfloat16_supported
from transformers import TrainingArguments, pipeline
from trl import SFTTrainer

In [20]:
# Set up logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('[%(asctime)s] %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

# Load environment variables
load_dotenv()
if not os.getenv("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Enter your HuggingFace token: ")
hf_token = os.getenv("HF_TOKEN")

hf_model_id = "meta-llama/Llama-3.2-1B-Instruct"
logger.info(f"Model ID: {hf_model_id}")

max_seq_length = 2048
dtype = None
load_in_4bit = True

[2025-04-02 03:33:01,003] INFO - Model ID: meta-llama/Llama-3.2-1B-Instruct
[2025-04-02 03:33:01,003] INFO - Model ID: meta-llama/Llama-3.2-1B-Instruct
[2025-04-02 03:33:01,003] INFO - Model ID: meta-llama/Llama-3.2-1B-Instruct
[2025-04-02 03:33:01,003] INFO - Model ID: meta-llama/Llama-3.2-1B-Instruct


In [21]:
# Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = hf_model_id,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.045 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [22]:
# Load and prepare dataset
dataset = load_dataset("banking77", split="train")
label_names = dataset.features["label"].names

def format_example(example):
    return {
        "text": f"### Instruction:\nClassify the following customer query into a banking category.\n\n### Input:\n{example['text']}\n\n### Response:\n{label_names[example['label']]}"
    }

dataset = dataset.map(format_example)


In [23]:
# Check precision support
fp16_enabled = not is_bfloat16_supported()
bf16_enabled = is_bfloat16_supported()

# Training
args = TrainingArguments(
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 1,
    max_steps = 600,
    learning_rate = 2e-4,
    fp16 = fp16_enabled,
    bf16 = bf16_enabled,
    logging_steps = 10,
    output_dir = "outputs",
    report_to = "none",
    seed = 3407,
)
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    args = args,
)


trainer.train()

# Save model
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

Tokenizing train dataset (num_proc=2): 100%|██████████| 10003/10003 [00:01<00:00, 5882.52 examples/s]
Tokenizing train dataset (num_proc=2): 100%|██████████| 10003/10003 [00:00<00:00, 17991.29 examples/s]
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,003 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 1
\        /    Total batch size = 4 | Total steps = 600
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
10,2.5425
20,1.5026
30,1.4589
40,1.134
50,1.0892
60,1.2964
70,1.1849
80,1.0394
90,1.0588
100,1.1213


('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [None]:
from unsloth import FastLanguageModel

FastLanguageModel.for_inference(model)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

test_inputs = [
    "I see a charge on my credit card statement but I paid on time, why?",
    "Do you have a branch in Timbuktu?",
    "I lost my card and my replacement card has not arrived."
]

results = []
for inp in test_inputs:
    prompt = f"""### Instruction:
Classify the following customer query into a banking category.

### Input:
{inp}

### Response:
"""
    out = pipe(prompt, max_new_tokens=20)[0]["generated_text"]
    response = out.split("### Response:")[-1].strip()
    results.append((inp, response))

with open("problem1_task1.txt", "w") as f:
    for inp, response in results:
        f.write(f"input: {inp}\ncategory: {response}\n\n")


Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausa