In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 64 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",

    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit" # NEW! Llama 3.3 70B!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    model_name = "unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.4.7 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from datasets import load_dataset
txt_path = '/content/drive/MyDrive/HPML_Project/dataset/finance_corpus.txt'

# one article per line in finance_corpus.txt (make sure you’ve uploaded it)
ds = load_dataset(
    "text",
    data_files={"train": txt_path},
    split="train",
)
ds = ds.filter(lambda x: x["text"].strip() != "")

print(f"Loaded {len(ds)} examples; sample text:")
print(ds[0]["text"][:200].replace("\n"," "), "…")

Loaded 227699 examples; sample text:
Article 1: Warren Buffett Autographed Books To Help Charity: Here's How You Can Get Legendary Investor's Signature …


In [6]:
for text in ds["text"][:10]:
    toks = tokenizer(text, add_special_tokens=False)
    print(len(toks["input_ids"]), "tokens")

23 tokens
41 tokens
29 tokens
48 tokens
48 tokens
24 tokens
36 tokens
33 tokens
20 tokens
19 tokens


In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq, DataCollatorForLanguageModeling
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = ds,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    # data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False),

    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 64,
        gradient_accumulation_steps = 8,
        warmup_steps = 5,
        num_train_epochs = 5, # Set this for 1 full training run.
        # max_steps = 60,
        learning_rate = 1e-5,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 100,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/227699 [00:00<?, ? examples/s]

In [8]:
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

"<|begin_of_text|>Article 1: Warren Buffett Autographed Books To Help Charity: Here's How You Can Get Legendary Investor's Signature"

In [9]:
trainer.train_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 227699
})

In [10]:
trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 227,699 | Num Epochs = 5 | Total steps = 2,220
O^O/ \_/ \    Batch size per device = 64 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (64 x 8 x 1) = 512
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
100,3.1283
200,2.8796
300,2.7742
400,2.6929
500,2.668
600,2.5938
700,2.5604
800,2.5348
900,2.5359
1000,2.4828


In [11]:
FastLanguageModel.for_inference(model)

def answer(prompt: str,
           max_new_tokens: int = 128,
           temperature: float    = 0.2,
           top_p: float          = 0.7,
           repetition_penalty: float = 1.2,
           no_repeat_ngram_size: int = 3):
    # 1) tokenize
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(model.device)

    input_ids = inputs["input_ids"]

    # 2) generate with anti‐repetition tweaks
    outputs = model.generate(
        **inputs,
        max_new_tokens       = max_new_tokens,
        temperature          = temperature,
        top_p                = top_p,
        do_sample            = True,
        repetition_penalty   = repetition_penalty,
        no_repeat_ngram_size = no_repeat_ngram_size,
        eos_token_id         = tokenizer.eos_token_id,
        pad_token_id         = tokenizer.pad_token_id,
        early_stopping       = True,
    )

    # 3) strip off prompt‐tokens and decode only the new ones
    gen_ids = outputs[0][ input_ids.shape[-1] : ]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

# Try it out
print(answer("How is starbucks doing?"))

The company's stock has outperformed the Zacks Retail - Restaurants industry over the past year (+12.6% vs. +1%). This performance was driven by strong sales growth, which were fueled by a 7.9% increase in comparable store sales and an impressive 8.4% rise in total revenue. Starbucks' solid financials also helped it beat Wall Street estimates for both earnings per share (EPS) and revenues on two occasions during fiscal 2025. Its EPS of $2.74 surpassed analysts’ expectations by 10%. It delivered a significant improvement from its prior-year figure as well: EPS rose


In [12]:
save_path = "/content/drive/MyDrive/HPML_Project/unsloth_a100_2"   # <— adjust if your folder is nested

# 1) Save LoRA adapter + config
trainer.save_model(save_path)

# 2) Save tokenizer files
tokenizer.save_pretrained(save_path)

print(" Saved adapters + tokenizer to", save_path)

 Saved adapters + tokenizer to /content/drive/MyDrive/HPML_Project/unsloth_a100_2


## Inference


In [20]:
# inference.py

import torch
from peft import prepare_model_for_kbit_training, PeftModel
from unsloth import FastLanguageModel

# 1) Load the same 4-bit base + tokenizer you fine-tuned on
base, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/Llama-3.2-1B-bnb-4bit",  # your base
    max_seq_length = 128,
    dtype          = torch.float16,                   # or None for auto
    load_in_4bit   = True,
    device_map     = "auto",
)

# ensure pad/eos tokens are set
tokenizer.pad_token = tokenizer.eos_token
base.config.pad_token_id = tokenizer.pad_token_id
base.config.use_cache      = True

# 2) Patch for QLoRA / k-bit adapters
base = prepare_model_for_kbit_training(base)

# 3) Load your fine-tuned LoRA adapters
model = PeftModel.from_pretrained(
    base,
    "/content/drive/MyDrive/HPML_Project/unsloth_a100_2",     # folder where you saved adapters + tokenizer
    device_map="auto",          # shard onto GPU automatically
)

model.eval()

# 4) Inference helper
def answer(prompt: str,
           max_new_tokens: int = 128,
           temperature: float    = 1.1,
           top_p: float          = 0.9,
           repetition_penalty: float = 1.2,
           no_repeat_ngram_size: int = 3):
    # 1) tokenize
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(model.device)

    input_ids = inputs["input_ids"]

    # 2) generate with anti‐repetition tweaks
    outputs = model.generate(
        **inputs,
        max_new_tokens       = max_new_tokens,
        temperature          = temperature,
        top_p                = top_p,
        do_sample            = True,
        repetition_penalty   = repetition_penalty,
        no_repeat_ngram_size = no_repeat_ngram_size,
        eos_token_id         = tokenizer.eos_token_id,
        pad_token_id         = tokenizer.pad_token_id,
        early_stopping       = True,
    )

    # 3) strip off prompt‐tokens and decode only the new ones
    gen_ids = outputs[0][ input_ids.shape[-1] : ]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

# Try it out
print(answer("How is starbucks stock doing?"))

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [24]:
# inference.py

import torch
from peft import prepare_model_for_kbit_training, PeftModel
from unsloth import FastLanguageModel

# 1) Load the same 4-bit base + tokenizer you fine-tuned on
base, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/Llama-3.2-1B-bnb-4bit",  # your base
    max_seq_length = 128,
    dtype          = torch.float16,                   # or None for auto
    load_in_4bit   = True,
    device_map     = "auto",
)

# ensure pad/eos tokens are set
tokenizer.pad_token = tokenizer.eos_token
base.config.pad_token_id = tokenizer.pad_token_id
base.config.use_cache      = True

# 2) Patch for QLoRA / k-bit adapters
base = prepare_model_for_kbit_training(base)

# 3) Load your fine-tuned LoRA adapters
model = PeftModel.from_pretrained(
    base,
    "/content/drive/MyDrive/HPML_Project/unsloth_a100_2",     # folder where you saved adapters + tokenizer
    device_map="auto",          # shard onto GPU automatically
)

model.eval()

# 4) Inference helper
def answer(prompt: str,
           max_new_tokens: int = 128,
           temperature: float    = 0.7,
           top_p: float          = 0.9,
           repetition_penalty: float = 1.2,
           no_repeat_ngram_size: int = 3):
    # 1) tokenize
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(model.device)

    input_ids = inputs["input_ids"]

    # 2) generate with anti‐repetition tweaks
    outputs = model.generate(
        **inputs,
        max_new_tokens       = max_new_tokens,
        temperature          = temperature,
        top_p                = top_p,
        do_sample            = True,
        repetition_penalty   = repetition_penalty,
        no_repeat_ngram_size = no_repeat_ngram_size,
        eos_token_id         = tokenizer.eos_token_id,
        pad_token_id         = tokenizer.pad_token_id,
        early_stopping       = True,
    )

    # 3) strip off prompt‐tokens and decode only the new ones
    gen_ids = outputs[0][ input_ids.shape[-1] : ]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

# Try it out
print(answer("What is the best performing stock?"))

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Find out in our Top 5 Stocks for this week. Click to read now… Continue Reading about Best Performing Stock: Amphenol (APH) Upgraded To Outperform; Here’s What Analysts Say About It and Other Top Picks Now available on TipRanks.  was originally published by The Motley Fool, hereoriginally posted on March 13, 2025by The Motely Follownowavailable at https://www.themotleyfool.com/stocks/AAMPH-Stock-Tip-Rankings-Amphenol-AAPM-3Q25-Q1FY26-Earnings-Cost-and


In [27]:
# inference.py

import torch
from peft import prepare_model_for_kbit_training, PeftModel
from unsloth import FastLanguageModel

# 1) Load the same 4-bit base + tokenizer you fine-tuned on
base, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/Llama-3.2-1B-bnb-4bit",  # your base
    max_seq_length = 128,
    dtype          = torch.float16,                   # or None for auto
    load_in_4bit   = True,
    device_map     = "auto",
)

# ensure pad/eos tokens are set
tokenizer.pad_token = tokenizer.eos_token
base.config.pad_token_id = tokenizer.pad_token_id
base.config.use_cache      = True

# 2) Patch for QLoRA / k-bit adapters
base = prepare_model_for_kbit_training(base)

# 3) Load your fine-tuned LoRA adapters
model = PeftModel.from_pretrained(
    base,
    "/content/drive/MyDrive/HPML_Project/unsloth_a100_2",     # folder where you saved adapters + tokenizer
    device_map="auto",          # shard onto GPU automatically
)

model.eval()

# 4) Inference helper
def answer(prompt: str,
           max_new_tokens: int = 128,
           temperature: float    = 1,
           top_p: float          = 0.9,
           repetition_penalty: float = 1.2,
           no_repeat_ngram_size: int = 3):
    # 1) tokenize
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(model.device)

    input_ids = inputs["input_ids"]

    # 2) generate with anti‐repetition tweaks
    outputs = model.generate(
        **inputs,
        max_new_tokens       = max_new_tokens,
        temperature          = temperature,
        top_p                = top_p,
        do_sample            = True,
        repetition_penalty   = repetition_penalty,
        no_repeat_ngram_size = no_repeat_ngram_size,
        eos_token_id         = tokenizer.eos_token_id,
        pad_token_id         = tokenizer.pad_token_id,
        early_stopping       = True,
    )

    # 3) strip off prompt‐tokens and decode only the new ones
    gen_ids = outputs[0][ input_ids.shape[-1] : ]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

# Try it out
print(answer("What is the news on Lockheed Martin?"))

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
2.30% (0%) Upward revision of Earnings Per Share Estimates from $6.88 to $7, and a 5-Year EPS Growth Rate Estimate for 23%, up by approximately +8%. What should we expect from this earnings announcement in terms of business outlooks or expectations about growth opportunities around aerospace defense, commercial air transport and space programs as well as new strategic initiatives that may arise with geopolitical events such as tariffs? Do you anticipate any specific challenges related to trade-related risks within your supply 