In [4]:
# Release memory left over from an earlier run in this kernel.
# Order matters: run the garbage collector first so unreachable Python objects
# (and any tensors they hold) are actually freed, THEN ask PyTorch's CUDA
# caching allocator to release its now-unused cached blocks.
import gc

import torch

gc.collect()
torch.cuda.empty_cache()

42066

In [None]:
# ============================================================================
# 🎃 TWO-SENTENCE HORROR STORY GENERATOR - MISTRAL 7B VERSION
# Fine-tuning Mistral 7B on Reddit r/TwoSentenceHorror with instruction format
# ============================================================================
# KAGGLE SETUP:
# 1. Settings → Accelerator → GPU T4 x2
# 2. Settings → Persistence → Files only
# 3. Add datasets: historical-two-sentence-horror-split
# 4. Run this entire notebook
# ============================================================================

# The CUDA-related environment variables are set at the very top of this cell,
# ahead of the ML library imports further down.
import os

os.environ.update(
    {
        # Make GPU 0 and GPU 1 visible (previously only "0").
        "CUDA_VISIBLE_DEVICES": "0,1",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

# ============================================================================
# CONFIGURATION
# ============================================================================

# Index of the dataset split to train on in this run.
DATASET_PART = 1

# Continuation switches: set CONTINUE_FROM_PREVIOUS to True (and point
# PREVIOUS_MODEL_PATH at the saved adapter) for parts 2, 3 and 4.
CONTINUE_FROM_PREVIOUS = False
PREVIOUS_MODEL_PATH = None

# Where checkpoints and the final model are written.
# NOTE(review): the directory name still says "llama" although the model
# trained below is Mistral.
OUTPUT_DIR = "/kaggle/working/llama-horror-part{}".format(DATASET_PART)

# Memory-related knobs.
USE_FLASH_ATTENTION = False  # disabled - not installed on Kaggle
MAX_SEQ_LENGTH = 384

# ============================================================================
# 1. INSTALL LIBRARIES
# ============================================================================
print("üì¶ Installing required libraries...")
!pip install -q --upgrade transformers datasets accelerate bitsandbytes peft trl

# ============================================================================
# 2. IMPORTS
# ============================================================================
import time

import torch
from datasets import load_dataset
from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTConfig, SFTTrainer

# Report the PyTorch build and, when CUDA is usable, the name and total memory
# of every visible GPU.
cuda_ready = torch.cuda.is_available()
print(f"‚úÖ PyTorch version: {torch.__version__}")
print(f"‚úÖ CUDA available: {cuda_ready}")

if cuda_ready:
    gpu_count = torch.cuda.device_count()
    print(f"‚úÖ GPU Count: {gpu_count}")
    for gpu_index in range(gpu_count):
        gpu_name = torch.cuda.get_device_name(gpu_index)
        # total_memory is in bytes; dividing by 1024**3 converts it to GiB.
        total_gib = torch.cuda.get_device_properties(gpu_index).total_memory / 1024**3
        print(f"‚úÖ GPU {gpu_index}: {gpu_name}")
        print(f"   Memory: {total_gib:.1f} GB")

# ============================================================================
# 3. LOAD DATASET
# ============================================================================
divider = "=" * 70

print("\n" + divider)
print(f"üìä LOADING DATASET PART {DATASET_PART}")
print(divider)

# Plain-text file for the selected split; the "text" loader exposes it as a
# dataset with a single 'text' column.
data_file_path = (
    "/kaggle/input/historical-two-sentence-horror-split/"
    f"dataset_part_{DATASET_PART}.txt"
)
print(f"Loading from: {data_file_path}")

raw_dataset = load_dataset(
    "text",
    data_files={"train": data_file_path},
    split="train",
)

print(f"‚úÖ Loaded {len(raw_dataset)} stories")
print("\nüìñ Raw sample:")
# Preview at most the first 200 characters of the first record.
first_story = raw_dataset[0]["text"]
print(first_story[:200] + "...")

# ============================================================================
# 4. FORMAT DATASET WITH INSTRUCTION TEMPLATE
# ============================================================================
print("\n" + divider)
print("üîß FORMATTING DATASET WITH INSTRUCTION TEMPLATE")
print(divider)

def format_instruction(example):
    """Wrap one raw story in the Mistral ``[INST] ... [/INST]`` template.

    Args:
        example: Mapping with a ``'text'`` entry holding the raw story.

    Returns:
        A dict with a single ``'text'`` key: the fixed instruction followed by
        the whitespace-stripped story, delimited by literal ``<s>`` / ``</s>``.
    """
    instruction = "Write a creative and chilling two-sentence horror story."
    body = example["text"].strip()
    return {"text": "<s>[INST] " + instruction + " [/INST] " + body + "</s>"}

# Run the instruction template over every record of the raw dataset.
formatted_dataset = raw_dataset.map(format_instruction)

formatted_count = len(formatted_dataset)
print(f"‚úÖ Formatted {formatted_count} stories")
print("\nüìñ Formatted sample:")
# Preview at most the first 300 characters of the first formatted record.
sample_preview = formatted_dataset[0]["text"][:300]
print(sample_preview + "...")

# ============================================================================
# 5. LOAD MODEL
# ============================================================================
# Fail fast on an inconsistent configuration. Without this check, asking to
# continue with no adapter path silently loaded a fresh base model, and the
# LoRA section further down then skipped attaching an adapter because
# CONTINUE_FROM_PREVIOUS was True.
if CONTINUE_FROM_PREVIOUS and not PREVIOUS_MODEL_PATH:
    raise ValueError(
        "CONTINUE_FROM_PREVIOUS is True but PREVIOUS_MODEL_PATH is not set; "
        "point PREVIOUS_MODEL_PATH at the saved LoRA adapter directory."
    )

print("\n" + "="*70)
if CONTINUE_FROM_PREVIOUS:
    print(f"üîÑ LOADING PREVIOUS MODEL")
else:
    print(f"ü§ñ LOADING MISTRAL 7B INSTRUCT")
print("="*70)

model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,  # Extra memory savings
)

if CONTINUE_FROM_PREVIOUS:
    print(f"\nüìÇ Loading previous model from: {PREVIOUS_MODEL_PATH}")

    # Step 1: quantized base model, placed entirely on GPU 0.
    print("   Step 1/3: Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="cuda:0",
        trust_remote_code=True,
        max_memory={0: "14GB"},
    )

    # Step 2: attach the previously trained LoRA adapter in trainable mode.
    print("   Step 2/3: Loading LoRA adapter...")
    model = PeftModel.from_pretrained(
        base_model,
        PREVIOUS_MODEL_PATH,
        is_trainable=True,
    )

    print("   Step 3/3: Enabling training mode...")
    model.train()

    # Cast every trainable parameter to bfloat16 (this changes dtype only;
    # it does not alter which parameters require gradients).
    for param in model.parameters():
        if param.requires_grad:
            param.data = param.data.to(torch.bfloat16)

    print(f"   ‚úÖ Previous model loaded")

else:
    print(f"\nüÜï Loading fresh {model_id}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",  # let the loader distribute layers over the visible GPUs
        trust_remote_code=True,
        low_cpu_mem_usage=True,
    )

    print("‚úÖ Base model loaded")
    print(f"üìä Model device map: {model.hf_device_map}")  # Show which layers on which GPU

# The KV cache is switched off for training.
model.config.use_cache = False
model.config.pretraining_tp = 1

# prepare_model_for_kbit_training is intentionally NOT called here: per the
# original author's note it caused out-of-memory errors on this setup.
print("‚úÖ Model prepared for training (skipping k-bit prep to save memory)")

# Tokenizer for the same checkpoint; padding reuses the EOS token and is
# applied on the right-hand side.
print("\nüìù Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Sum of element counts over all parameter tensors as they are stored.
# NOTE(review): for a 4-bit quantized model this presumably counts packed
# storage elements rather than logical weights - confirm before quoting it.
total_params = 0
for weight in model.parameters():
    total_params += weight.numel()
print(f"‚úÖ Model size: ~{total_params / 1e9:.2f}B parameters")

# ============================================================================
# 6. LORA CONFIGURATION
# ============================================================================
if CONTINUE_FROM_PREVIOUS:
    # The adapter loaded from the previous run already carries its own LoRA
    # configuration, so no new one is created.
    peft_config = None
    print("\n‚úÖ Using existing LoRA configuration from previous model")
else:
    print("\n" + "="*70)
    print("‚öôÔ∏è  CONFIGURING LORA")
    print("="*70)

    # Adapters are attached to the four attention projection layers.
    lora_targets = ["q_proj", "k_proj", "v_proj", "o_proj"]
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,  # 2x the rank
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=lora_targets,
    )

    # Wrap the base model with the LoRA adapters and report trainable sizes.
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    print("‚úÖ LoRA config applied")
    print(f"   - Rank: {peft_config.r}")
    print(f"   - Alpha: {peft_config.lora_alpha}")
    print(f"   - Target modules: {len(peft_config.target_modules)} modules")

# ============================================================================
# 7. TRAINING ARGUMENTS
# ============================================================================
print("\n" + "="*70)
print("üéØ TRAINING CONFIGURATION")
print("="*70)

# SFTConfig is TRL's TrainingArguments subclass. It is used here (instead of a
# plain TrainingArguments) so that MAX_SEQ_LENGTH is actually applied as the
# truncation length; previously the value was only printed and never reached
# the trainer.
training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    fp16=False,
    bf16=True,
    optim="paged_adamw_8bit",
    logging_steps=25,
    save_steps=400,
    save_total_limit=3,
    report_to="none",
    warmup_steps=100,
    weight_decay=0.01,
    max_grad_norm=0.3,
    dataloader_pin_memory=False,
    group_by_length=True,
    gradient_checkpointing=True,      # trade compute for lower activation memory
    max_length=MAX_SEQ_LENGTH,        # truncate tokenized examples to this length
)

print("‚úÖ Training arguments set")
print(f"   - Dataset: Part {DATASET_PART} ({len(formatted_dataset)} stories)")
print(f"   - Epochs: {training_args.num_train_epochs}")
print(f"   - Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"   - Learning rate: {training_args.learning_rate}")
print(f"   - Save every: {training_args.save_steps} steps")
# Report the value the trainer will really use, not just the constant.
print(f"   - Max sequence length: {training_args.max_length}")
print(f"   - Gradient checkpointing: ENABLED (saves memory)")
print(f"   - Output: {OUTPUT_DIR}")

# ============================================================================
# 8. INITIALIZE TRAINER
# ============================================================================
print("\n" + "="*70)
print("üöÄ INITIALIZING TRAINER")
print("="*70)

# Formatting function for SFTTrainer
def formatting_prompts_func(example):
    """Hand the trainer the already-formatted training string.

    Args:
        example: A dataset record with a ``"text"`` entry.

    Returns:
        The value stored under ``example["text"]``, unchanged.
    """
    formatted_text = example["text"]
    return formatted_text

# Both load paths in this notebook leave `model` already wrapped as a PeftModel
# (via PeftModel.from_pretrained or get_peft_model). Passing a peft_config on
# top of an already-wrapped model asks the trainer to handle the adapter a
# second time, so the config is only forwarded when the model is not wrapped yet.
trainer_peft_config = None if isinstance(model, PeftModel) else peft_config

trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_dataset,
    args=training_args,
    peft_config=trainer_peft_config,
    formatting_func=formatting_prompts_func,
)

print("‚úÖ Trainer initialized")

# ============================================================================
# 9. TRAIN!
# ============================================================================
print("\n" + "="*70)
if CONTINUE_FROM_PREVIOUS:
    print(f"üî• CONTINUING TRAINING FROM PREVIOUS MODEL")
else:
    print(f"üî• STARTING FRESH TRAINING")
print(f"üìö Training on {len(formatted_dataset)} stories from Part {DATASET_PART}")
print("="*70)
print("‚è∞ Estimated time: ~45-60 minutes...")
# Read the interval from the training arguments so the message cannot drift
# from the configured save_steps (it used to claim a hard-coded 300).
print(f"üí° Checkpoints will be saved every {training_args.save_steps} steps")
print()

start_time = time.time()

try:
    trainer.train()
    training_time = time.time() - start_time

    print("\n" + "="*70)
    print(f"‚úÖ TRAINING COMPLETE! Time: {training_time/60:.1f} minutes")
    print("="*70)

except Exception as e:
    # Best effort: report the failure and let the cell continue so whatever
    # was trained so far can still be saved by the following section.
    import traceback

    training_time = time.time() - start_time
    print("\n" + "="*70)
    print(f"‚ùå TRAINING FAILED AFTER {training_time/60:.1f} minutes")
    print(f"Error: {e}")
    print("="*70)
    # Show the full stack so the failure can be diagnosed.
    traceback.print_exc()

# ============================================================================
# 10. SAVE FINAL MODEL
# ============================================================================
for banner_line in ("\n" + "=" * 70, "üíæ SAVING FINAL MODEL", "=" * 70):
    print(banner_line)

# The final model lives in a fixed sub-directory of the run's output folder.
final_output_dir = OUTPUT_DIR + "/final-model"
os.makedirs(final_output_dir, exist_ok=True)

# Saving the model is best effort: a failure is reported but does not stop
# the tokenizer from being saved afterwards.
try:
    trainer.model.save_pretrained(final_output_dir)
    print("‚úÖ Model saved")
except Exception as save_error:
    print(f"‚ö†Ô∏è Save failed: {save_error}")

tokenizer.save_pretrained(final_output_dir)
print("‚úÖ Tokenizer saved")

# Short pause before the saved files are inspected.
time.sleep(2)

# ============================================================================
# 11. VERIFY SAVES
# ============================================================================
for banner_line in ("\n" + "=" * 70, "üìÅ CHECKING SAVED FILES", "=" * 70):
    print(banner_line)

# List intermediate checkpoints (directory entries named "checkpoint-*").
if os.path.exists(OUTPUT_DIR):
    checkpoints = list(
        filter(lambda entry: entry.startswith("checkpoint-"), os.listdir(OUTPUT_DIR))
    )
    if len(checkpoints) > 0:
        print(f"\n‚úÖ Found {len(checkpoints)} checkpoint(s):")
        for checkpoint_name in sorted(checkpoints):
            print(f"   üìÇ {checkpoint_name}")

# Show regular files among the first ten (sorted) entries of the final model
# directory, with their size in MiB.
if os.path.exists(final_output_dir):
    print(f"\n‚úÖ Final model directory: {final_output_dir}")
    first_entries = sorted(os.listdir(final_output_dir))[:10]
    for entry_name in first_entries:
        entry_path = os.path.join(final_output_dir, entry_name)
        if not os.path.isfile(entry_path):
            continue
        size_mb = os.path.getsize(entry_path) / 1024 / 1024
        print(f"   üìÑ {entry_name} ({size_mb:.1f} MB)")

# ============================================================================
# 12. TEST THE MODEL
# ============================================================================
print("\n" + "="*70)
print("üß™ TESTING MODEL WITH SAMPLE PROMPTS")
print("="*70)

# User-side instructions; each is wrapped in the [INST] template below.
test_prompts = [
    "Write a creative and chilling two-sentence horror story about a mother and daughter.",
    "Write a creative and chilling two-sentence horror story about technology.",
    "Write a creative and chilling two-sentence horror story about being alone at night.",
]

tuned_model = trainer.model
tuned_model.eval()

for i, user_prompt in enumerate(test_prompts, 1):
    print(f"\n{'='*70}")
    print(f"Test {i}/{len(test_prompts)}")
    print(f"{'='*70}")
    print(f"üé≠ PROMPT: {user_prompt}\n")

    # Same instruction template that was used for the training examples.
    full_prompt = f"""<s>[INST] {user_prompt} [/INST]"""

    try:
        inputs = tokenizer(full_prompt, return_tensors="pt").to(tuned_model.device)
        # Number of prompt tokens; generate() returns prompt + continuation.
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = tuned_model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.8,          # Lower for more coherent output
                top_p=0.92,
                do_sample=True,
                repetition_penalty=1.3,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. The previous approach split
        # the full decoded text on the word "assistant", a marker that does not
        # occur in the [INST] template, so the prompt was always echoed back
        # (and a story containing "assistant" would have been cut short).
        new_tokens = outputs[0][prompt_length:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        print(f"üëª GENERATED STORY:\n{response}\n")

    except Exception as e:
        # Keep going with the remaining prompts if one generation fails.
        print(f"‚ùå Generation failed: {e}\n")

# ============================================================================
# 13. FINAL SUMMARY
# ============================================================================
divider = "=" * 70
training_mode = "Continued from previous" if CONTINUE_FROM_PREVIOUS else "Fresh training"

# Recap of what this run used and produced.
print("\n" + divider)
print("üéâ SUMMARY")
print(divider)
for summary_line in (
    "‚úÖ Model: Mistral 7B Instruct",
    f"‚úÖ Dataset: Part {DATASET_PART} ({len(formatted_dataset)} stories)",
    f"‚úÖ Training mode: {training_mode}",
    f"‚úÖ Output directory: {OUTPUT_DIR}",
    f"‚úÖ Final model: {final_output_dir}",
):
    print(summary_line)

# Instructions for training on the next dataset part.
print("\nüìù NEXT STEPS:")
for step_line in (
    "   1. Click 'Save Version' to save this notebook",
    "   2. Create a dataset from the output",
    "   3. For next part, update:",
    f"      - DATASET_PART = {DATASET_PART + 1}",
    "      - CONTINUE_FROM_PREVIOUS = True",
    "      - PREVIOUS_MODEL_PATH = '/kaggle/input/your-saved-model/final-model'",
):
    print(step_line)

print("\nüí° MISTRAL 7B ADVANTAGES:")
for advantage_line in (
    "   ‚úì Fits comfortably on single T4 GPU",
    "   ‚úì Excellent instruction following",
    "   ‚úì More coherent than Phi-3",
    "   ‚úì Fast training (~45-60 min per part)",
    "   ‚úì Great balance of quality and efficiency",
):
    print(advantage_line)

print("\n" + divider)

üì¶ Installing required libraries...
‚úÖ PyTorch version: 2.6.0+cu124
‚úÖ CUDA available: True
‚úÖ GPU Count: 2
‚úÖ GPU 0: Tesla T4
   Memory: 14.7 GB
‚úÖ GPU 1: Tesla T4
   Memory: 14.7 GB

üìä LOADING DATASET PART 1
Loading from: /kaggle/input/historical-two-sentence-horror-split/dataset_part_1.txt
‚úÖ Loaded 20000 stories

üìñ Raw sample:
"Do not expose any part of your body to the air.". "I repeat..this is not a drill.."...

üîß FORMATTING DATASET WITH INSTRUCTION TEMPLATE
‚úÖ Formatted 20000 stories

üìñ Formatted sample:
<s>[INST] Write a creative and chilling two-sentence horror story. [/INST] "Do not expose any part of your body to the air.". "I repeat..this is not a drill.."</s>...

ü§ñ LOADING MISTRAL 7B INSTRUCT

üÜï Loading fresh mistralai/Mistral-7B-Instruct-v0.3...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

‚úÖ Base model loaded
üìä Model device map: {'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 0, 'model.layers.14': 0, 'model.layers.15': 0, 'model.layers.16': 0, 'model.layers.17': 0, 'model.layers.18': 0, 'model.layers.19': 0, 'model.layers.20': 0, 'model.layers.21': 0, 'model.layers.22': 0, 'model.layers.23': 0, 'model.layers.24': 0, 'model.layers.25': 0, 'model.layers.26': 0, 'model.layers.27': 0, 'model.layers.28': 0, 'model.layers.29': 0, 'model.layers.30': 1, 'model.layers.31': 1, 'model.norm': 1, 'model.rotary_emb': 1, 'lm_head': 1}
‚úÖ Model prepared for training (skipping k-bit prep to save memory)

üìù Loading tokenizer...
‚úÖ Model size: ~3.76B parameters

‚öôÔ∏è  CONFIGURING LORA
trainable params: 6,815,7



Applying formatting function to train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20000 [00:00<?, ? examples/s]

‚úÖ Trainer initialized

üî• STARTING FRESH TRAINING
üìö Training on 20000 stories from Part 1
‚è∞ Estimated time: ~45-60 minutes...
üí° Checkpoints will be saved every 300 steps



  return fn(*args, **kwargs)


Step,Training Loss
