## History Chatbot - Unsloth Fine-Tuning Documentation
Complete cell-by-cell documentation for fine-tuning Phi-3-mini on Wikipedia history articles using Unsloth and LoRA.

Overview
Model: Phi-3-mini (3.8B parameters)
Method: LoRA Fine-Tuning
Dataset: ~25,000 History Q&A pairs from Wikipedia
Training Time: 2-3 hours on T4 GPU
Memory Required: 4-5GB VRAM

Set up the required Python libraries for efficient LLM fine-tuning.

In [None]:
!pip install -q unsloth trl peft accelerate bitsandbytes
!pip install -q datasets

Verifies CUDA availability and displays GPU specifications to ensure proper hardware setup.


In [None]:
import torch
# Report whether a CUDA device is visible; when one is, also show the name
# and total memory (converted from bytes to GiB) of device 0.
cuda_ready = torch.cuda.is_available()
print(f"🔧 CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"   GPU: {gpu_name}")
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"   Memory: {gpu_memory_gb:.1f} GB")

### Create History Q&A Dataset from Wikipedia
This cell generates conversational Q&A pairs from Wikipedia history articles to create a specialized training dataset.

In [None]:
import json
import random
from datasets import load_dataset

# WikiAuto (GEM build) supplies the Wikipedia-derived text for this notebook.
# Its rows are read later through their 'source' and 'target' fields.
WIKI_DATASET_ID = "GEM/wiki_auto_asset_turk"

print("📥 Loading Wikipedia dataset...")
wiki_data = load_dataset(WIKI_DATASET_ID, split="train")

def extract_history_content(text):
    """Return True when ``text`` mentions at least one history-related term.

    Ordinary keywords are matched case-insensitively as whole words, with an
    optional plural "s"/"es", so "war" matches "War" and "wars" but not
    "award" or "software". The era abbreviations BC/AD/BCE/CE are matched
    case-sensitively and must not be adjacent to another letter, so "300 BC"
    and "300BC" match while "CEO" and "adobe" do not.

    Previously the text was lowercased before a plain substring test, which
    made the uppercase era abbreviations unmatchable and let fragments such
    as 'era' in "several" or 'age' in "village" accept almost any passage.
    Expect this stricter filter to keep fewer passages than before.

    Args:
        text: Passage to inspect.

    Returns:
        True if any keyword or era abbreviation is found, otherwise False.
    """
    import re  # local import: the surrounding cell does not import re

    history_keywords = (
        'war', 'century', 'ancient', 'empire', 'kingdom', 'battle', 'revolution',
        'king', 'queen', 'emperor', 'dynasty', 'civilization', 'historical',
        'medieval', 'colonial', 'independence', 'treaty', 'conquest', 'reign',
        'founded', 'abolished', 'established', 'era', 'period', 'age',
        'year', 'born', 'died', 'ruled',
    )
    # Longest alternative first so "BCE" is not cut short at "BC".
    era_markers = ('BCE', 'BC', 'AD', 'CE')

    keyword_pattern = r"\b(?:" + "|".join(history_keywords) + r")(?:s|es)?\b"
    if re.search(keyword_pattern, text, flags=re.IGNORECASE):
        return True

    # Letter-only lookarounds (rather than \b) so a digit may touch the
    # abbreviation, as in "300BC".
    era_pattern = r"(?<![A-Za-z])(?:" + "|".join(era_markers) + r")(?![A-Za-z])"
    return re.search(era_pattern, text) is not None

def create_history_qa_dataset(num_samples=5000):
    """
    Build conversational Q&A pairs from history-related rows of ``wiki_data``.

    At most the first 20,000 rows of the module-level ``wiki_data`` are
    scanned. A row is kept when its 'source' and 'target' are non-empty, the
    source is at least 50 characters long, and ``extract_history_content``
    accepts the source. Every kept row yields five pairs that all share the
    row's 'target' as the answer: two topic questions, two context-plus-
    follow-up questions, and one direct "explain this text" request.

    Args:
        num_samples: Stop once this many rows have been kept.

    Returns:
        A list of dicts, each with an "input" and an "output" string.
    """
    print("🔄 Creating history Q&A dataset...")

    # Templates that receive a short snippet of the source as the "topic".
    topic_templates = [
        "Tell me about {}",
        "What happened during {}?",
        "Explain the history of {}",
        "What do you know about {}?",
        "Can you describe {}?",
        "Give me information about {}",
        "What was {}?",
        "Who was involved in {}?",
    ]

    # Generic follow-ups that are asked against an excerpt given as context.
    follow_ups = [
        "Can you explain this historical event?",
        "Tell me more about this period in history.",
        "What happened here?",
        "Explain this to me.",
        "What's the significance of this?",
        "Can you summarize this historical information?",
        "What are the key facts about this?",
    ]

    pairs = []
    accepted = 0
    scan_limit = min(len(wiki_data), 20000)

    for row in wiki_data.select(range(scan_limit)):
        passage, answer = row['source'], row['target']

        # Drop rows with missing text, very short sources, or no history terms.
        usable = passage and answer and len(passage) >= 50
        if not usable or not extract_history_content(passage):
            continue

        accepted += 1

        # Topic = leading 50 characters, shortened to 45 plus an ellipsis
        # when the stripped snippet is still longer than 45.
        topic = passage[:50].strip()
        if len(topic) > 45:
            topic = topic[:45] + "..."

        # Two randomly chosen topic questions.
        pairs.extend(
            {"input": template.format(topic), "output": answer}
            for template in random.sample(topic_templates, 2)
        )

        # Two randomly chosen follow-ups, each with a 200-character context.
        pairs.extend(
            {
                "input": f"Context: {passage[:200]}\n\nQuestion: {follow_up}",
                "output": answer,
            }
            for follow_up in random.sample(follow_ups, 2)
        )

        # One direct explanation request over a 300-character excerpt.
        pairs.append({
            "input": f"Explain this historical text: {passage[:300]}",
            "output": answer,
        })

        if accepted % 500 == 0:
            print(f"   Processed {accepted} history articles, created {len(pairs)} Q&A pairs...")

        if accepted >= num_samples:
            break

    print(f"✅ Created {len(pairs)} history Q&A pairs from {accepted} articles")
    return pairs

# Build the Q&A pairs (up to 5000 accepted articles).
history_qa = create_history_qa_dataset(num_samples=5000)

# Persist the pairs as pretty-printed JSON; the encoding is stated explicitly
# so the file does not depend on the platform default.
with open("history_qa_dataset.json", "w", encoding="utf-8") as f:
    json.dump(history_qa, f, indent=2)

print("\n💾 Dataset saved to 'history_qa_dataset.json'")
print(f"📊 Total Q&A pairs: {len(history_qa)}")

# Preview up to three pairs. Slicing (instead of indexing 0..2) avoids an
# IndexError when fewer than three pairs were produced.
print("\n📝 Sample Q&A pairs:")
for example_number, pair in enumerate(history_qa[:3], start=1):
    print(f"\n--- Example {example_number} ---")
    print(f"Q: {pair['input'][:150]}...")
    print(f"A: {pair['output'][:150]}...")


Load the pre-trained Phi-3-mini model with 4-bit quantization for memory-efficient training.

In [None]:
from unsloth import FastLanguageModel

# Checkpoint and loading options for the base model.
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
max_seq_length = 2048
dtype = None  # None = auto detection

print(f"\n📦 Loading model: {model_name}")

# Collect the loader options in one place, then load model and tokenizer.
load_options = {
    "model_name": model_name,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)


This cell converts raw Q&A pairs into Phi-3's specific chat template format required for training.

In [None]:
from datasets import Dataset

def format_prompt(example):
    """Render one Q&A record as a single-turn chat transcript.

    Args:
        example: Mapping with an 'input' (user message) and an 'output'
            (assistant reply).

    Returns:
        The user turn and the assistant turn, each closed with ``<|end|>``
        and separated by a newline.
    """
    user_turn = f"<|user|>\n{example['input']}<|end|>"
    assistant_turn = f"<|assistant|>\n{example['output']}<|end|>"
    return "\n".join((user_turn, assistant_turn))

# Render every Q&A record to chat text and wrap it in a one-column Dataset.
formatted_data = list(map(format_prompt, history_qa))
dataset = Dataset.from_dict({"text": formatted_data})

print(f"✅ Dataset formatted: {len(dataset)} training examples")

# Hold out 10% for validation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is made per Q&A pair, so pairs built from the same
# article can land on both sides - confirm this is acceptable for evaluation.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

print(f"   Train: {len(train_dataset)}")
print(f"   Validation: {len(eval_dataset)}")


### Adding LoRA Adapters

In [None]:

# Modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
lora_rank = 32  # LoRA rank (reduced for faster training)

model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=lora_target_modules,
    lora_alpha=2 * lora_rank,  # LoRA scaling factor, kept at 2x rank (64)
    lora_dropout=0,  # No dropout for optimization
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print("✅ LoRA adapters added")

### Setting up training arguments and training the model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Choose the mixed-precision mode once: bf16 when the GPU supports it,
# otherwise fp16.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=4,  # Increased from 2
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # 4 x 4 -> effective batch size of 16
    warmup_steps=50,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=50,
    # Evaluate and checkpoint on the same 200-step cadence.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="history_chatbot_outputs",
    dataloader_pin_memory=False,
    report_to="none",
    # Reload the checkpoint with the best "loss" metric when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

print("✅ Trainer configured")

print("\n🚀 Starting training...")
import time
start_time = time.time()

trainer_stats = trainer.train()

# Wall-clock duration of the training call, reported in minutes.
elapsed_seconds = time.time() - start_time
elapsed_time = elapsed_seconds / 60
print(f"\n🎉 Training complete in {elapsed_time:.1f} minutes!")


### Test functionality of the model

In [None]:
FastLanguageModel.for_inference(model)  # Enable faster inference

print("\n🧪 Testing the model...\n")

# Fixed prompts used as a quick qualitative check of the fine-tuned model.
test_questions = [
    "Tell me about the Roman Empire",
    "What happened during World War II?",
    "Explain the French Revolution",
    "Who was Alexander the Great?",
    "What was the Renaissance period?",
    "Tell me about ancient Egypt",
]

for question in test_questions:
    messages = [
        {"role": "user", "content": question},
    ]

    # Tokenize the single-turn chat and append the assistant generation prompt.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=256,
        use_cache=True,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        repetition_penalty=1.1,
    )

    # generate() returns the prompt followed by the continuation. Decoding
    # with skip_special_tokens=True removes role markers that are registered
    # as special tokens, so splitting the text on "<|assistant|>" cannot
    # isolate the reply and the question would be echoed back. Decode only
    # the tokens that follow the prompt instead.
    prompt_length = inputs.shape[-1]
    response = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True,
    ).strip()

    print(f"Q: {question}")
    print(f"A: {response}\n")
    print("-" * 80)

### Interactive Chat Interface

In [None]:
print("\n" + "="*80)
print("🤖 HISTORY CHATBOT - Interactive Mode")
print("="*80)
print("Commands: 'quit' to exit")
print("="*80 + "\n")

# Single-turn chat loop: every message is answered independently, with no
# conversation history carried between turns.
while True:
    try:
        user_input = input("You: ").strip()

        if user_input.lower() in ['quit', 'exit', 'q']:
            print("👋 Goodbye!")
            break

        # Ignore empty lines instead of sending them to the model.
        if not user_input:
            continue

        messages = [{"role": "user", "content": user_input}]

        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=256,
            use_cache=True,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
        )

        # generate() returns prompt + continuation. Role markers registered
        # as special tokens are stripped by skip_special_tokens=True, so a
        # split on "<|assistant|>" would leave the user's message in the
        # reply. Decode only the tokens generated after the prompt.
        prompt_length = inputs.shape[-1]
        response = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
        ).strip()

        print(f"Bot: {response}\n")

    except (KeyboardInterrupt, EOFError):
        # Ctrl-C, or a closed input stream, ends the session cleanly.
        print("\n👋 Goodbye!")
        break

In [None]:
def calculate_metrics(model, tokenizer, test_questions):
    """Generate one reply per question and report the mean reply length.

    Only the tokens produced after the prompt are decoded, so the reported
    length covers the model's reply and not the echoed question. (Decoding
    the full sequence with ``skip_special_tokens=True`` and then splitting on
    "<|assistant|>" does not work when that marker is a special token: it is
    stripped during decoding and the prompt stays in the text.)

    Generation samples with temperature 0.7, so results vary between runs.

    Args:
        model: Object exposing ``generate(input_ids=..., **kwargs)`` that
            returns sequences made of the prompt followed by new tokens.
        tokenizer: Object exposing ``apply_chat_template`` and ``decode``.
        test_questions: Sequence of user prompts.

    Returns:
        ``{"average_response_length": value}`` where ``value`` is the mean
        character count of the stripped replies, or 0 when there are no
        questions.
    """
    num_questions = len(test_questions)
    if num_questions == 0:
        return {"average_response_length": 0}

    total_response_length = 0

    for question in test_questions:
        messages = [{"role": "user", "content": question}]
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda")

        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=256,
            use_cache=True,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
        )

        # Skip the prompt tokens so that only the reply is measured.
        prompt_length = inputs.shape[-1]
        response = tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
        ).strip()

        total_response_length += len(response)

    return {"average_response_length": total_response_length / num_questions}

# Run the evaluation only when the earlier cells have defined everything it
# needs. The namespace is captured first because locals() evaluated inside a
# comprehension may not refer to this cell's scope.
cell_namespace = locals()
missing_names = [
    name
    for name in ('model', 'tokenizer', 'test_questions')
    if name not in cell_namespace
]

if missing_names:
    print("   Model, tokenizer, or test_questions not found. Please run previous cells.")
else:
    metrics = calculate_metrics(model, tokenizer, test_questions)
    print(f"   Average Response Length: {metrics['average_response_length']:.2f}")

print("✅ Performance evaluation complete")

### Save Model in GGUF format

In [None]:
print("\n💾 Saving model...")

# Output directories for the two export formats.
gguf_dir = "history_chatbot_gguf"
hf_dir = "history_chatbot_model"

# Export a GGUF build (q4_k_m quantization) for easy deployment.
model.save_pretrained_gguf(gguf_dir, tokenizer, quantization_method="q4_k_m")
print("✅ Model saved in GGUF format")

# Write the model and the tokenizer side by side in HuggingFace format.
# NOTE(review): on a PEFT-wrapped model save_pretrained typically writes only
# the adapter weights - confirm whether a merged checkpoint is wanted here.
for artifact in (model, tokenizer):
    artifact.save_pretrained(hf_dir)
print("✅ Model saved in HuggingFace format")