In [1]:
# !pip install unsloth
# !pip install accelerate
# !pip install bitsandbytes
# !pip install transformers>=4.37.2
# !pip install sentencepiece

# Import required libraries

In [2]:
import json
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from unsloth import FastLanguageModel
from typing import Dict, Any
import os
from huggingface_hub import login
import warnings
warnings.filterwarnings('ignore')


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


# Configure basic settings

In [3]:
MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"
DATASET_PATH = "alpaca_dataset/alpaca_news_summaries_latest.json"
OUTPUT_DIR = "trained_news_summarizer"
MAX_SEQ_LENGTH = 2048

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load dataset

In [4]:
# Load the dataset
def load_dataset(dataset_path):
    with open(dataset_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)
    
    # Convert to HuggingFace Dataset format
    dataset = Dataset.from_dict({
        'instruction': [item['instruction'] for item in raw_data['data']],
        'input': [item['input'] for item in raw_data['data']],
        'output': [item['output'] for item in raw_data['data']]
    })
    
    print(f"Loaded dataset with {len(dataset)} examples")
    return dataset

dataset = load_dataset(DATASET_PATH)

Loaded dataset with 398 examples


# Initialize model and tokenizer

In [5]:
print("Loading model and tokenizer...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    dtype=None,
    load_in_4bit=True
)


# Apply LoRA configuration
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias='none',
    use_gradient_checkpointing="unsloth",
    random_state=42069,
    use_rslora=False,
    loftq_config=None
)

print("Model loaded and LoRA configured")

Loading model and tokenizer...
==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.46.0.
   \\   /|    GPU: NVIDIA GeForce RTX 4070. Max memory: 11.616 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu124. CUDA = 8.9. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.10.7 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Model loaded and LoRA configured


# Define preprocessing function

In [6]:
def preprocess_function(examples):
    # Combine instruction, input, and output
    prompts = [
        f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
        for instruction, input_text, output in zip(
            examples['instruction'],
            examples['input'],
            examples['output']
        )
    ]
    
    # Tokenize with padding and truncation
    model_inputs = tokenizer(
        prompts,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors=None  # Return as lists, not tensors
    )
    
    # Create attention masks
    model_inputs["attention_mask"] = [
        [1 if token != tokenizer.pad_token_id else 0 for token in input_ids]
        for input_ids in model_inputs["input_ids"]
    ]
    
    # Add labels for training
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    
    return model_inputs


# Process the dataset

In [7]:
print("Processing dataset...")
processed_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Preprocessing dataset"
)

# Convert dataset format
processed_dataset.set_format(
    type="torch", 
    columns=["input_ids", "attention_mask", "labels"]
)

print("Dataset processing completed")

Processing dataset...


Preprocessing dataset: 100%|██████████| 398/398 [00:00<00:00, 795.87 examples/s]

Dataset processing completed





# Create data collator

In [8]:
class CustomDataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def __call__(self, features):
        batch = {
            'input_ids': torch.stack([f['input_ids'] for f in features]),
            'attention_mask': torch.stack([f['attention_mask'] for f in features]),
            'labels': torch.stack([f['labels'] for f in features])
        }
        return batch

data_collator = CustomDataCollator(tokenizer)


# Define training arguments

In [9]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=50,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="no",
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to="tensorboard"  # Enable TensorBoard logging
)


# Initialize trainer

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    data_collator=data_collator,
)

# Start training

In [11]:
print("Starting training...")
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 398 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 75
 "-____-"     Number of trainable parameters = 24,313,856


Starting training...


  0%|          | 0/75 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.91 GiB. GPU 0 has a total capacity of 11.62 GiB of which 3.66 GiB is free. Process 3912 has 154.75 MiB memory in use. Process 4647 has 1.42 GiB memory in use. Including non-PyTorch memory, this process has 5.73 GiB memory in use. Of the allocated memory 5.47 GiB is allocated by PyTorch, and 74.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Save the final model

In [None]:
print("Saving model...")
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")


## Optional: Test the model

In [None]:
def generate_summary(article_text, max_length=512):
    prompt = f"### Instruction:\nSummarize the following article in bullet points:\n\n### Input:\n{article_text}\n\n### Response:\n"
    
    inputs = tokenizer(prompt, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"].to(model.device),
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True
        )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test with a sample article
sample_article = "Your test article text here..."
generated_summary = generate_summary(sample_article)
print("Generated Summary:")
print(generated_summary)