In [1]:
%%capture
# Install Unsloth and its dependencies; %%capture suppresses the pip output of this cell.
import os
# The session is treated as Colab when any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Dependencies are installed with --no-deps and two of them are pinned
    # (xformers 0.0.29.post3, trl 0.15.2); unsloth itself is left unpinned.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch
# Notebook-wide configuration for loading the base model; `max_seq_length` is
# also passed to the SFTTrainer further down.
# NOTE(review): the token counts printed later in this notebook for the first ten
# articles (554-974 tokens) are all above 512, so articles are presumably
# truncated during training — confirm this is intended.
max_seq_length = 512 # chosen for optimum results and training time
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models
# Reference catalogue only: this list is not read anywhere in the visible cells;
# the model actually loaded is named explicitly in the call below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",

    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit" # NEW! Llama 3.3 70B!
]

# Load the 4-bit Llama-3.2-1B base checkpoint together with its tokenizer.
# Both names are reused by the LoRA, training and generation cells below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,

)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

In [3]:
# Wrap the base model with LoRA adapters on every attention and MLP projection.
# NOTE(review): this rebinds `model` to its own wrapped version, so re-running
# the cell feeds an already-wrapped model back in — restart from the load cell
# if you need to change these settings.
model = FastLanguageModel.get_peft_model(
    model,
    # --- adapter shape ---
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,    # 0 is the optimized setting
    bias = "none",       # "none" is the optimized setting
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
    ],
    # --- initialisation variants ---
    use_rslora = False,   # rank-stabilized LoRA disabled
    loftq_config = None,  # no LoftQ configuration supplied
    # --- memory / reproducibility ---
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" (lower VRAM, long context)
    random_state = 3407,
)

Unsloth 2025.4.7 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read the corpus from, and
# save the adapters to, paths underneath this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from datasets import load_dataset
# Location of the raw finance corpus on the mounted Drive (reused by the
# article-parsing cell further down).
txt_path = '/content/drive/MyDrive/HPML_Project/dataset/finance_corpus.txt'

# Read the file through the "text" builder and drop rows that are empty after
# stripping whitespace.
# NOTE(review): a later cell rebinds `ds` to an article-level dataset, so this
# line-level version only serves as a quick look at the file.
ds = load_dataset(
    "text",
    data_files={"train": txt_path},
    split="train",
).filter(lambda row: row["text"].strip() != "")

print(f"Loaded {len(ds)} examples; sample text:")
first_text = ds[0]["text"]
print(first_text[:200].replace("\n"," "), "…")

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/441891 [00:00<?, ? examples/s]

Loaded 227699 examples; sample text:
Article 1: Warren Buffett Autographed Books To Help Charity: Here's How You Can Get Legendary Investor's Signature …


In [6]:
# 90/10 train/validation split with a fixed seed.
# NOTE(review): this splits the line-level `ds`; the same three names are
# assigned again after the article-level dataset is built.
split = ds.train_test_split(test_size=0.10, seed=42)
train_ds, val_ds = split["train"], split["test"]

In [7]:
import re
from datasets import Dataset

def _read_articles(path):
    """Group the lines of the corpus file into one string per article.

    A line matching ``Article <number>:`` at its start opens a new article.
    Every line (header included) is stripped and the lines of one article are
    joined with single spaces.

    Notes:
        * Any text before the first header is emitted as its own entry.
        * Blank lines are kept as empty strings in the join, so the joined text
          contains extra spaces where the file had blank lines.
          NOTE(review): presumably unintended — confirm before changing, since
          it alters the training text.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        list[str]: One entry per article, in file order.
    """
    header = re.compile(r"^Article\s+\d+:")  # compiled once instead of per line
    collected, current = [], []

    # The codec name is spelled with a plain ASCII hyphen; the previous literal
    # contained a non-ASCII hyphen character.
    with open(path, encoding="utf-8") as corpus:
        for line in corpus:
            # A header closes the article collected so far (if any).
            if header.match(line) and current:
                collected.append(" ".join(current).strip())
                current = []
            current.append(line.strip())

    # Flush the trailing article.
    if current:
        collected.append(" ".join(current).strip())
    return collected


articles = _read_articles(txt_path)

print("Total articles:", len(articles))
# Rebinds `ds`: from here on it is the article-level dataset.
ds = Dataset.from_dict({"text": articles})

Total articles: 9285


In [8]:
# 90/10 train/validation split of the article-level dataset (fixed seed);
# these are the splits handed to the trainer.
split = ds.train_test_split(test_size=0.10, seed=42)
train_ds, val_ds = split["train"], split["test"]

In [9]:
# Display the training split summary (features and row count).
train_ds

Dataset({
    features: ['text'],
    num_rows: 8356
})

In [10]:
# Print the tokenized length (no special tokens) of the first ten articles,
# to compare against `max_seq_length`.
for article in ds["text"][:10]:
    n_tokens = len(tokenizer(article, add_special_tokens=False)["input_ids"])
    print(n_tokens, "tokens")

670 tokens
799 tokens
696 tokens
554 tokens
797 tokens
813 tokens
728 tokens
757 tokens
962 tokens
974 tokens


In [11]:
from transformers import TrainerCallback
import math

class PerplexityCallback(TrainerCallback):
    """Trainer callback that prints the perplexity after each evaluation."""

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Print ``exp(eval_loss)`` when the evaluation metrics contain an eval loss.

        Does nothing when ``metrics`` is None or has no ``"eval_loss"`` key.
        """
        if metrics is None or "eval_loss" not in metrics:
            return
        perplexity = math.exp(metrics["eval_loss"])
        print(f"Eval perplexity: {perplexity:.2f}")

In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq, DataCollatorForLanguageModeling
from unsloth import is_bfloat16_supported

# Collator with mlm=False (no masked-LM objective).
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Optimisation, precision and logging settings for the run.
training_args = TrainingArguments(
    # schedule
    num_train_epochs = 5,
    per_device_train_batch_size = 32,
    gradient_accumulation_steps = 1,
    warmup_steps = 5,
    eval_strategy = "epoch",          # evaluate once per epoch
    # optimiser
    optim = "adamw_8bit",
    learning_rate = 2e-5,
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    # precision: bf16 when supported, otherwise fp16
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    # bookkeeping
    logging_steps = 5,
    seed = 3407,
    output_dir = "outputs",
    report_to = "none",               # set to e.g. WandB to enable external logging
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    # data
    train_dataset = train_ds,
    eval_dataset = val_ds,
    dataset_text_field = "text",
    dataset_num_proc = 2,
    max_seq_length = max_seq_length,
    # Packing is requested here; note that the output of this cell reports
    # Unsloth disabling it.
    packing = True,
    data_collator = lm_collator,
    # reporting
    callbacks = [PerplexityCallback()],
    args = training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/8356 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/929 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


In [13]:
# Sanity check: decode the first tokenized training example back to text.
tokenizer.decode(trainer.train_dataset[0]["input_ids"])



In [14]:
# Display the trainer's processed training set (columns and row count).
trainer.train_dataset

Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 8356
})

In [15]:
# Run the fine-tuning; the value returned by train() is kept in `trainer_stats`.
trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 8,356 | Num Epochs = 5 | Total steps = 1,310
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Epoch,Training Loss,Validation Loss
1,2.147,2.130299
2,1.9917,2.025411
3,1.9176,1.975027
4,1.8814,1.953023
5,1.9173,1.945981


Unsloth: Will smartly offload gradients to save VRAM!
Eval perplexity: 8.42
Eval perplexity: 7.58
Eval perplexity: 7.21
Eval perplexity: 7.05
Eval perplexity: 7.00


In [17]:
# Switch the fine-tuned model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

def answer(prompt: str,
           max_new_tokens: int = 128,
           temperature: float    = 0.2,
           top_p: float          = 0.7,
           repetition_penalty: float = 1.2,
           no_repeat_ngram_size: int = 3,
           max_prompt_tokens: int = 512):
    """Generate a continuation of ``prompt`` and return only the new text.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature. ``None`` or a value <= 0 selects
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only used when sampling.
        repetition_penalty: Penalty passed through to ``model.generate``.
        no_repeat_ngram_size: N-gram size passed through to ``model.generate``.
        max_prompt_tokens: The prompt is truncated to this many tokens.

    Returns:
        str: The decoded continuation, without the prompt, with special tokens
        removed and surrounding whitespace stripped.
    """
    # 1) tokenize (and truncate) the prompt, then move it to the model's device
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    # 2) assemble generation settings; `early_stopping` is intentionally not
    #    passed because it is a beam-search option and no beams are used here.
    gen_kwargs = dict(
        max_new_tokens       = max_new_tokens,
        repetition_penalty   = repetition_penalty,
        no_repeat_ngram_size = no_repeat_ngram_size,
        eos_token_id         = tokenizer.eos_token_id,
        pad_token_id         = tokenizer.pad_token_id,
    )
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling needs a strictly positive temperature; decode greedily instead.
        gen_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **gen_kwargs)

    # 3) drop the prompt tokens and decode only the newly generated ones
    gen_ids = outputs[0][prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

# Smoke test: print the model's continuation for a sample finance question.
print(answer("How is starbucks doing?"))

Starbucks Corporation (NASDAQ:SBUX) has been on the market for a while now, and it's time to see how its performance compares with other stocks. The company recently reported earnings results that were in line with expectations but missed analysts' revenue estimates by 0%. This was partly due to higher costs associated with new stores opening up across different regions of the world.
Starbucks also announced plans to open more than 1,000 locations this year as part of an expansion strategy aimed at driving growth through innovation and customer experience improvements. However, these initiatives may have come too late given recent macroeconomic challenges such as rising inflation rates and trade


In [18]:
# Destination directory on the mounted Drive for the fine-tuned artifacts.
# NOTE(review): the inference cells below repeat this path as a string literal
# instead of reusing `save_path`; keep them in sync when changing it.
save_path = "/content/drive/MyDrive/HPML_Project/copy_unsloth_a100_6"   # <— adjust if your folder is nested

# 1) Save LoRA adapter + config
trainer.save_model(save_path)

# 2) Save tokenizer files
tokenizer.save_pretrained(save_path)

print(" Saved adapters + tokenizer to", save_path)

 Saved adapters + tokenizer to /content/drive/MyDrive/HPML_Project/copy_unsloth_a100_6


## Inference


In [19]:
# inference.py

import torch
from peft import prepare_model_for_kbit_training, PeftModel
from unsloth import FastLanguageModel

# 1) Load the same 4-bit base + tokenizer that was fine-tuned.
# NOTE(review): `max_seq_length` here (128) is smaller than the 512-token prompt
# limit used by `answer` below, and `dtype` is fixed to float16 whereas the
# training cell let it auto-detect — confirm both are intended.
base, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/Llama-3.2-1B-bnb-4bit",  # your base
    max_seq_length = 128,
    dtype          = torch.float16,                   # or None for auto
    load_in_4bit   = True,
    device_map     = "auto",
)

# Use EOS as the padding token and enable the KV cache for generation.
tokenizer.pad_token = tokenizer.eos_token
base.config.pad_token_id = tokenizer.pad_token_id
base.config.use_cache      = True

# 2) Attach the fine-tuned LoRA adapters.
# `prepare_model_for_kbit_training` is deliberately not called: it prepares a
# quantized model for training and this model is only used for generation.
model = PeftModel.from_pretrained(
    base,
    "/content/drive/MyDrive/HPML_Project/copy_unsloth_a100_6",     # folder where you saved adapters + tokenizer
    device_map="auto",          # shard onto GPU automatically
)

# 3) Switch to Unsloth's inference mode.
FastLanguageModel.for_inference(model)

# 4) Inference helper
def answer(prompt: str,
           max_new_tokens: int = 128,
           temperature: float    = 0.2,
           top_p: float          = 0.7,
           repetition_penalty: float = 1.2,
           no_repeat_ngram_size: int = 3,
           max_prompt_tokens: int = 512):
    """Generate a continuation of ``prompt`` and return only the new text.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature. ``None`` or a value <= 0 selects
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only used when sampling.
        repetition_penalty: Penalty passed through to ``model.generate``.
        no_repeat_ngram_size: N-gram size passed through to ``model.generate``.
        max_prompt_tokens: The prompt is truncated to this many tokens.

    Returns:
        str: The decoded continuation, without the prompt, with special tokens
        removed and surrounding whitespace stripped.
    """
    # 1) tokenize (and truncate) the prompt, then move it to the model's device
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    # 2) assemble generation settings; `early_stopping` is intentionally not
    #    passed because it is a beam-search option and no beams are used here.
    gen_kwargs = dict(
        max_new_tokens       = max_new_tokens,
        repetition_penalty   = repetition_penalty,
        no_repeat_ngram_size = no_repeat_ngram_size,
        eos_token_id         = tokenizer.eos_token_id,
        pad_token_id         = tokenizer.pad_token_id,
    )
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling needs a strictly positive temperature; decode greedily instead.
        gen_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **gen_kwargs)

    # 3) drop the prompt tokens and decode only the newly generated ones
    gen_ids = outputs[0][prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

# Smoke test of the reloaded adapters: print the continuation for a sample question.
print(answer("How is starbucks stock doing?"))

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Starbucks (SBUX) Stock Price and Report Card: A Look at the Market’s Top Trending Stocks. The market has been experiencing a period of volatility, with several stocks witnessing significant price movements in recent days. While some have experienced declines, others have seen gains or remain unchanged from their previous levels.
As investors assess these fluctuations amid broader economic uncertainty surrounding tariffs and trade tensions between China and other countries, it remains crucial to consider how individual companie

In [20]:
# inference.py

import torch
from peft import prepare_model_for_kbit_training, PeftModel
from unsloth import FastLanguageModel

# 1) Load the same 4-bit base + tokenizer that was fine-tuned.
# NOTE(review): the previous inference cell performs this exact load; if it has
# already been run in this kernel, its `model`/`tokenizer` can be reused instead
# of loading the weights again.
base, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/Llama-3.2-1B-bnb-4bit",  # your base
    max_seq_length = 128,
    dtype          = torch.float16,                   # or None for auto
    load_in_4bit   = True,
    device_map     = "auto",
)

# Use EOS as the padding token and enable the KV cache for generation.
tokenizer.pad_token = tokenizer.eos_token
base.config.pad_token_id = tokenizer.pad_token_id
base.config.use_cache      = True

# 2) Attach the fine-tuned LoRA adapters.
# `prepare_model_for_kbit_training` is deliberately not called: it prepares a
# quantized model for training and this model is only used for generation.
model = PeftModel.from_pretrained(
    base,
    "/content/drive/MyDrive/HPML_Project/copy_unsloth_a100_6",     # folder where you saved adapters + tokenizer
    device_map="auto",          # shard onto GPU automatically
)

# 3) Switch to Unsloth's inference mode.
FastLanguageModel.for_inference(model)

# 4) Inference helper
def answer(prompt: str,
           max_new_tokens: int = 128,
           temperature: float    = 0.2,
           top_p: float          = 0.9,
           repetition_penalty: float = 1.2,
           no_repeat_ngram_size: int = 3,
           max_prompt_tokens: int = 512):
    """Generate a continuation of ``prompt`` and return only the new text.

    Uses the notebook-level ``model`` and ``tokenizer``. This variant defaults
    to ``top_p=0.9``.

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature. ``None`` or a value <= 0 selects
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only used when sampling.
        repetition_penalty: Penalty passed through to ``model.generate``.
        no_repeat_ngram_size: N-gram size passed through to ``model.generate``.
        max_prompt_tokens: The prompt is truncated to this many tokens.

    Returns:
        str: The decoded continuation, without the prompt, with special tokens
        removed and surrounding whitespace stripped.
    """
    # 1) tokenize (and truncate) the prompt, then move it to the model's device
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    # 2) assemble generation settings; `early_stopping` is intentionally not
    #    passed because it is a beam-search option and no beams are used here.
    gen_kwargs = dict(
        max_new_tokens       = max_new_tokens,
        repetition_penalty   = repetition_penalty,
        no_repeat_ngram_size = no_repeat_ngram_size,
        eos_token_id         = tokenizer.eos_token_id,
        pad_token_id         = tokenizer.pad_token_id,
    )
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling needs a strictly positive temperature; decode greedily instead.
        gen_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **gen_kwargs)

    # 3) drop the prompt tokens and decode only the newly generated ones
    gen_ids = outputs[0][prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

# Smoke test: print the continuation for a second sample question.
print(answer("What is the best performing stock?"))

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
What stocks are trading at all-time highs and why do they matter for your portfolio. These questions have been on my mind lately as I’ve tried to figure out what’s going on with these companies, but it seems like there isn’t a clear answer.
The first thing you need to know about this question is that no one knows exactly how much money will be made from investing in any given company or sector over time. That means we can never really predict whether something will go up or down based solely off of its current price point alon

In [21]:
# inference.py

import torch
from peft import prepare_model_for_kbit_training, PeftModel
from unsloth import FastLanguageModel

# 1) Load the same 4-bit base + tokenizer that was fine-tuned.
# NOTE(review): the earlier inference cells perform this exact load; if one of
# them has already been run in this kernel, its `model`/`tokenizer` can be
# reused instead of loading the weights again.
base, tokenizer = FastLanguageModel.from_pretrained(
    model_name     = "unsloth/Llama-3.2-1B-bnb-4bit",  # your base
    max_seq_length = 128,
    dtype          = torch.float16,                   # or None for auto
    load_in_4bit   = True,
    device_map     = "auto",
)

# Use EOS as the padding token and enable the KV cache for generation.
tokenizer.pad_token = tokenizer.eos_token
base.config.pad_token_id = tokenizer.pad_token_id
base.config.use_cache      = True

# 2) Attach the fine-tuned LoRA adapters.
# `prepare_model_for_kbit_training` is deliberately not called: it prepares a
# quantized model for training and this model is only used for generation.
model = PeftModel.from_pretrained(
    base,
    "/content/drive/MyDrive/HPML_Project/copy_unsloth_a100_6",     # folder where you saved adapters + tokenizer
    device_map="auto",          # shard onto GPU automatically
)

# 3) Switch to Unsloth's inference mode.
FastLanguageModel.for_inference(model)

# 4) Inference helper
def answer(prompt: str,
           max_new_tokens: int = 128,
           temperature: float    = 0.2,
           top_p: float          = 0.9,
           repetition_penalty: float = 1.2,
           no_repeat_ngram_size: int = 3,
           max_prompt_tokens: int = 512):
    """Generate a continuation of ``prompt`` and return only the new text.

    Uses the notebook-level ``model`` and ``tokenizer``. This variant defaults
    to ``top_p=0.9``.

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature. ``None`` or a value <= 0 selects
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only used when sampling.
        repetition_penalty: Penalty passed through to ``model.generate``.
        no_repeat_ngram_size: N-gram size passed through to ``model.generate``.
        max_prompt_tokens: The prompt is truncated to this many tokens.

    Returns:
        str: The decoded continuation, without the prompt, with special tokens
        removed and surrounding whitespace stripped.
    """
    # 1) tokenize (and truncate) the prompt, then move it to the model's device
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    # 2) assemble generation settings; `early_stopping` is intentionally not
    #    passed because it is a beam-search option and no beams are used here.
    gen_kwargs = dict(
        max_new_tokens       = max_new_tokens,
        repetition_penalty   = repetition_penalty,
        no_repeat_ngram_size = no_repeat_ngram_size,
        eos_token_id         = tokenizer.eos_token_id,
        pad_token_id         = tokenizer.pad_token_id,
    )
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling needs a strictly positive temperature; decode greedily instead.
        gen_kwargs["do_sample"] = False

    outputs = model.generate(**inputs, **gen_kwargs)

    # 3) drop the prompt tokens and decode only the newly generated ones
    gen_ids = outputs[0][prompt_len:]
    return tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

# Smoke test: print the continuation for a third sample question.
print(answer("What is the news on Lockheed Martin?"))

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
The company has been in a bit of turmoil lately, with its stock down more than 20% since reporting earnings last month. But there are some positive developments that could help boost investor confidence and support share prices.
Firstly, CEO Marillyn Hewson announced plans to retire at the end of this year after nearly four decades leading the aerospace giant. This move comes as part of an ongoing succession plan aimed at ensuring continuity for future leadership decisions. It also signals potential changes within management r