# Import required libraries

In [None]:
import json
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from unsloth import FastLanguageModel
from typing import Dict, Any
import os
from huggingface_hub import login
import warnings
warnings.filterwarnings('ignore')


# Configure basic settings

In [3]:
# Checkpoint identifier passed to the model loader as model_name.
MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct"
# JSON file with the training examples; passed to load_dataset().
DATASET_PATH = "alpaca_dataset/alpaca_news_summaries_latest.json"
# Directory used as the Trainer output_dir and for the saved tokenizer.
OUTPUT_DIR = "trained_news_summarizer"
# Token limit used as max_length when tokenizing training examples.
MAX_SEQ_LENGTH = 2048

# Create the output directory up front; exist_ok=True makes re-running
# this cell harmless when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load dataset

In [None]:
# Load the dataset
def load_dataset(dataset_path):
    """Read an Alpaca-style JSON file and wrap it in a HuggingFace Dataset.

    Args:
        dataset_path: Path to a UTF-8 JSON file whose top-level ``data`` key
            holds a list of records, each with ``instruction``, ``input``
            and ``output`` entries.

    Returns:
        A ``Dataset`` with the columns ``instruction``, ``input`` and
        ``output``.

    Raises:
        KeyError: If ``data`` or one of the three fields is missing.
    """
    with open(dataset_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Pivot the list of records into one list per field, which is the
    # column-oriented layout Dataset.from_dict expects.
    field_names = ('instruction', 'input', 'output')
    columns = {
        name: [record[name] for record in payload['data']]
        for name in field_names
    }
    hf_dataset = Dataset.from_dict(columns)

    print(f"Loaded dataset with {len(hf_dataset)} examples")
    return hf_dataset

# Load the raw (untokenized) training examples from DATASET_PATH.
dataset = load_dataset(DATASET_PATH)

# Initialize model and tokenizer

In [None]:
print("Loading model and tokenizer...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    # Keep the model's context length in sync with the limit used when
    # tokenizing the training data instead of relying on the loader default.
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=None,  # None leaves the dtype choice to the loader
    load_in_4bit=True
)


# Apply LoRA configuration: adapters are attached to the attention
# projections (q/k/v/o) and the MLP projections (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias='none',
    use_gradient_checkpointing="unsloth",
    random_state=42069,  # fixed seed passed to the adapter setup
    use_rslora=False,
    loftq_config=None
)

print("Model loaded and LoRA configured")

# Define preprocessing function

In [6]:
def preprocess_function(examples):
    """Tokenize a batch of instruction/input/output records for causal-LM training.

    Each record is rendered into the ``### Instruction / ### Input /
    ### Response`` prompt layout, terminated with the tokenizer's EOS token,
    then tokenized and padded/truncated to ``MAX_SEQ_LENGTH``.

    Args:
        examples: Batched mapping with parallel lists under the keys
            ``instruction``, ``input`` and ``output``.

    Returns:
        The tokenizer output (lists, not tensors) with an added ``labels``
        entry. Labels equal ``input_ids`` on real tokens and ``-100`` on
        padding so that padding does not contribute to the loss.
    """
    # Terminate every example with EOS so the model learns where a summary
    # ends; fall back to nothing if the tokenizer defines no EOS token.
    eos_text = tokenizer.eos_token or ""

    # Combine instruction, input, and output
    prompts = [
        f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}{eos_text}"
        for instruction, input_text, output in zip(
            examples['instruction'],
            examples['input'],
            examples['output']
        )
    ]

    # Tokenize with padding and truncation. The tokenizer's own attention
    # mask is kept as-is: rebuilding it from pad_token_id would wrongly mask
    # real EOS tokens when the pad and EOS ids coincide.
    model_inputs = tokenizer(
        prompts,
        max_length=MAX_SEQ_LENGTH,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors=None  # Return as lists, not tensors
    )

    # Labels mirror the inputs on attended positions; padded positions get
    # -100, the index the loss function ignores.
    model_inputs["labels"] = [
        [token if attended else -100 for token, attended in zip(ids, mask)]
        for ids, mask in zip(
            model_inputs["input_ids"],
            model_inputs["attention_mask"]
        )
    ]

    return model_inputs


# Process the dataset

In [None]:
print("Processing dataset...")
# Tokenize in batches. The original text columns are removed so that only
# the entries returned by preprocess_function remain in the dataset.
processed_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Preprocessing dataset"
)

# Expose the three training columns as torch tensors when rows are read;
# the collator stacks them with torch.stack.
processed_dataset.set_format(
    type="torch", 
    columns=["input_ids", "attention_mask", "labels"]
)

print("Dataset processing completed")

# Create data collator

In [8]:
class CustomDataCollator:
    """Collate pre-tokenized examples into a batch of stacked tensors.

    Every example must already hold equally sized tensors under
    ``input_ids``, ``attention_mask`` and ``labels``; no padding is done here.
    """

    def __init__(self, tokenizer):
        # Stored on the instance; __call__ does not consult it.
        self.tokenizer = tokenizer

    def __call__(self, features):
        """Stack each field across ``features`` along a new leading dimension.

        Args:
            features: Sequence of mappings, each with tensor values under
                ``input_ids``, ``attention_mask`` and ``labels``.

        Returns:
            Dict with the same three keys, each a stacked tensor.
        """
        field_names = ('input_ids', 'attention_mask', 'labels')
        return {
            name: torch.stack([example[name] for example in features])
            for name in field_names
        }

# Collator instance handed to the Trainer below.
data_collator = CustomDataCollator(tokenizer)


# Define training arguments

In [9]:
# The model is loaded with dtype=None (auto-detected), so the mixed-precision
# mode must follow the hardware: bf16 where the GPU supports it, fp16
# otherwise. Hard-coding fp16=True conflicts with a bfloat16-loaded model.
from unsloth import is_bfloat16_supported

_use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device
    learning_rate=2e-4,
    warmup_steps=50,
    fp16=not _use_bf16,
    bf16=_use_bf16,
    logging_steps=10,
    save_strategy="epoch",
    # No evaluation is configured: "no" is the default, and the old
    # `evaluation_strategy` keyword is deprecated/removed in newer
    # transformers releases, so the argument is simply omitted.
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to="tensorboard"  # Enable TensorBoard logging
)


# Initialize trainer

In [10]:
# Wire the LoRA-wrapped model, the tokenized dataset and the custom collator
# into the Trainer. No evaluation dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    data_collator=data_collator,
)

# Start training

In [None]:
print("Starting training...")
# Run the training loop as configured by training_args.
trainer.train()


# Save the final model

In [None]:
print("Saving model...")
# save_model() is called without a path; presumably it writes to
# training_args.output_dir (OUTPUT_DIR) — confirm against the Trainer docs.
trainer.save_model()
# Store the tokenizer files in OUTPUT_DIR as well.
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")


## Optional: Test the model

In [None]:
def generate_summary(article_text, max_length=512):
    """Generate a bullet-point summary of ``article_text`` with the trained model.

    The prompt uses the same ``### Instruction / ### Input / ### Response``
    layout as the training data.

    Args:
        article_text: Article text to summarize.
        max_length: Maximum number of tokens to generate for the summary.
            The prompt does not count against this budget, so long articles
            still leave room for the output.

    Returns:
        The decoded summary only; the echoed prompt is stripped.
    """
    prompt = f"### Instruction:\nSummarize the following article in bullet points:\n\n### Input:\n{article_text}\n\n### Response:\n"

    # Move ids and attention mask to the model's device together.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly so generate() does not have to guess
            # which positions are real tokens.
            attention_mask=inputs["attention_mask"],
            # Budget applies to new tokens only; `max_length` would also
            # count the prompt and can leave nothing for the summary.
            max_new_tokens=max_length,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True
        )

    # generate() returns prompt + continuation; keep only the continuation.
    summary_ids = outputs[0][prompt_length:]
    return tokenizer.decode(summary_ids, skip_special_tokens=True)

# Smoke-test generation with a placeholder article; replace the string
# below with real article text before judging output quality.
sample_article = "Your test article text here..."
generated_summary = generate_summary(sample_article)
print("Generated Summary:")
print(generated_summary)