# 🛡️ Hancock Fine-Tuning — Kaggle GPU
**CyberViser** | Free: 30h/week T4 GPU on Kaggle

**Before running:**
1. Settings → Accelerator → **GPU T4 x2**
2. Settings → Internet → **On**
3. Add secrets: `HF_TOKEN` (optional)

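> ⚠️ To push the trained model to the Hugging Face Hub, add your Hugging Face token as a Kaggle Secret named `HF_TOKEN`. Without it, the model is only saved under `/kaggle/working/`.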

In [None]:
# Install dependencies
# 1) Unsloth, straight from its GitHub repository with the `kaggle-new` extra.
# 2) The training stack imported by the later cells (trl, transformers,
#    datasets, peft, bitsandbytes, ...). `-q` keeps pip's output short.
# NOTE(review): nothing here is version-pinned, so the resolved versions can
# change from one run to the next — consider pinning once a working set is known.
!pip install 'unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git' -q
!pip install trl transformers accelerate datasets peft bitsandbytes requests tqdm -q
print('✅ Done')

In [None]:
# Clone repo + build v3 dataset (CISA KEV + Atomic Red Team + GHSA)
!git clone https://github.com/cyberviser/Hancock.git
import os; os.chdir('Hancock')
!python hancock_pipeline.py --phase 3

from pathlib import Path
path = Path('data/hancock_v3.jsonl') if Path('data/hancock_v3.jsonl').exists() else Path('data/hancock_v2.jsonl')
lines = path.read_text().strip().splitlines()
print(f'\u2705 {len(lines):,} training samples ready from {path.name}')

In [None]:
import torch, json
from unsloth import FastLanguageModel
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# Load the base model in 4-bit through Unsloth, with a 2048-token context.
# dtype=None leaves the dtype choice to the library (presumably auto-detected
# from the GPU — confirm against the Unsloth docs).
model, tokenizer = FastLanguageModel.from_pretrained(
    'mistralai/Mistral-7B-Instruct-v0.3',
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Projection layers that receive LoRA adapters: the four attention
# projections followed by the three MLP projections.
lora_targets = [
    'q_proj', 'k_proj', 'v_proj', 'o_proj',
    'gate_proj', 'up_proj', 'down_proj',
]

# Wrap the model with rank-32 LoRA adapters (alpha 32, dropout 0.05, no bias
# training); random_state is fixed so adapter initialisation is repeatable.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=lora_targets,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
    use_gradient_checkpointing='unsloth',
    random_state=42,
)
print('✅ Model + LoRA loaded')

In [None]:
# Load the JSONL dataset, render each conversation with the tokenizer's chat
# template, and split it into train / eval sets.
from pathlib import Path

# Prefer the v3 dataset; fall back to v2 when the v3 file is not present.
path = Path('data/hancock_v3.jsonl') if Path('data/hancock_v3.jsonl').exists() else Path('data/hancock_v2.jsonl')

# Decode explicitly as UTF-8 and skip blank lines: a stray empty line in the
# middle of the file would otherwise make json.loads raise and abort the cell.
raw = [
    json.loads(line)
    for line in path.read_text(encoding='utf-8').splitlines()
    if line.strip()
]
if not raw:
    # Fail here with a clear message rather than later inside train_test_split.
    raise ValueError(f'No training samples found in {path}')

# Each record is expected to carry a 'messages' list; it is rendered to plain
# text without a trailing generation prompt, since the samples are complete
# conversations used for supervised fine-tuning.
texts = [
    tokenizer.apply_chat_template(sample['messages'], tokenize=False, add_generation_prompt=False)
    for sample in raw
]

# Hold out 5% for evaluation; the fixed seed keeps the split reproducible.
ds = Dataset.from_dict({'text': texts}).train_test_split(test_size=0.05, seed=42)
print(f'Train: {len(ds["train"]):,} | Eval: {len(ds["test"]):,}')

In [None]:
# Choose the mixed-precision mode once: bf16 when the GPU supports it,
# otherwise fp16.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation settings: batch 2 x 4 accumulation steps per device, 3 epochs,
# cosine schedule with 5% warm-up, 8-bit AdamW, one checkpoint per epoch.
# NOTE(review): no evaluation strategy is set here, so the eval split passed
# below is presumably only used by an explicit trainer.evaluate() — confirm.
training_args = TrainingArguments(
    output_dir='/kaggle/working/checkpoints',
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    weight_decay=0.01,
    optim='adamw_8bit',
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=20,
    save_strategy='epoch',
    report_to='none',
    seed=42,
)

# Supervised fine-tuning on the pre-rendered 'text' column, with sequences
# packed up to 2048 tokens.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    dataset_text_field='text',
    max_seq_length=2048,
    packing=True,
    args=training_args,
)

result = trainer.train()
print(f'✅ Training complete — loss: {result.training_loss:.4f}')

In [None]:
# Save + push to HuggingFace Hub
import os

# Write the LoRA adapter and tokenizer first, then a q4_k_m-quantised GGUF
# export, all under /kaggle/working/.
model.save_pretrained('/kaggle/working/hancock_lora')
tokenizer.save_pretrained('/kaggle/working/hancock_lora')
model.save_pretrained_gguf('/kaggle/working/hancock_gguf', tokenizer, quantization_method='q4_k_m')
print('✅ Saved to /kaggle/working/')

# Resolve the Hugging Face token. An HF_TOKEN environment variable still wins
# (so the cell keeps working outside Kaggle), but Kaggle Secrets are NOT
# exported as environment variables: they have to be read through the
# kaggle_secrets client, which only exists inside Kaggle notebooks.
hf_token = os.getenv('HF_TOKEN', '')
if not hf_token:
    try:
        from kaggle_secrets import UserSecretsClient
    except ImportError:
        # Not running on Kaggle: there is no secret store to query.
        pass
    else:
        try:
            hf_token = UserSecretsClient().get_secret('HF_TOKEN') or ''
        except Exception:
            # The token is optional. The lookup raises when the secret is
            # missing or not attached to this notebook (the exact exception
            # type comes from Kaggle's client), so treat any failure as
            # "no token" and fall through to the hint below.
            hf_token = ''

if hf_token:
    model.push_to_hub('cyberviser/hancock-mistral-7b-lora', token=hf_token)
    tokenizer.push_to_hub('cyberviser/hancock-mistral-7b-lora', token=hf_token)
    print('✅ Pushed to huggingface.co/cyberviser/hancock-mistral-7b-lora')
else:
    print('ℹ️  Add HF_TOKEN as Kaggle Secret to push to HuggingFace Hub')