# CTIScraper Fine-tuning on Colab T4 GPU

This notebook runs fine-tuning on Google Colab's free T4 GPU while integrating with your local UI.

## Setup Instructions:
1. Enable GPU: Runtime → Change runtime type → GPU → T4
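2. Run all cells to set up the environment (note: the last cell also starts a short example training run and pushes the result to the Hugging Face Hub)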
3. Use your local UI to trigger training

In [None]:
# Install required packages
# No versions are pinned, so each run installs whatever releases are current.
!pip install transformers torch datasets accelerate huggingface_hub
# NOTE(review): colabcode is not imported by any cell visible in this notebook —
# confirm it is still needed before keeping this install.
!pip install -q colabcode

In [None]:
# Import libraries
import sys
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
import torch
import json
from huggingface_hub import login

# Report whether PyTorch can see a CUDA device and, if so, its name and memory
_cuda_ready = torch.cuda.is_available()
print(f"GPU available: {_cuda_ready}")
if _cuda_ready:
    # Device index 0 is the only device queried here.
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {_gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Setup Hugging Face Hub authentication
def setup_huggingface_hub():
    """Authenticate with the Hugging Face Hub using a token from the environment.

    The token is read from the ``HF_TOKEN`` environment variable, falling back
    to ``HUGGING_FACE_HUB_TOKEN``. It is never stored in the notebook itself,
    because notebooks are routinely shared and committed.

    Returns:
        bool: True when ``login`` succeeded; False when no token was found or
        ``login`` raised.
    """
    # SECURITY: an access token must not be written as a literal in this cell.
    # Any token that was ever saved in a copy of this notebook should be
    # revoked in the Hugging Face account settings and replaced.
    token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    if not token:
        print("❌ Hugging Face Hub setup failed: set the HF_TOKEN environment variable before running this cell")
        return False

    try:
        login(token=token)
    except Exception as e:
        # Cell-level boundary: report the failure and let the notebook continue.
        print(f"❌ Hugging Face Hub setup failed: {e}")
        return False

    print("✅ Hugging Face Hub authentication successful")
    return True


setup_huggingface_hub()

In [None]:
# Fine-tuning trainer class
class FineTuningTrainer:
    """Fine-tune a Hugging Face causal language model on prompt/response pairs.

    The class wraps three steps behind ``fine_tune_model``: loading the model
    and tokenizer, tokenizing the examples into a ``datasets.Dataset``, and
    running a ``transformers.Trainer``.
    """

    def __init__(self, device="auto"):
        """Create a trainer with no model loaded yet.

        Args:
            device: Stored as ``self.device``. Actual device placement is
                decided in ``load_model``.
        """
        self.device = device
        self.model = None
        self.tokenizer = None

    def load_model(self, model_name):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.

        Returns:
            tuple: ``(model, tokenizer)``; both are also stored on the instance.
        """
        print(f"Loading model: {model_name}")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            # Batched tokenization needs a pad token; reuse EOS when none is defined.
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Weights stay in float32. fine_tune_model enables fp16 on GPU, which is
        # automatic mixed precision: the gradient scaler expects float32
        # gradients and raises "Attempting to unscale FP16 gradients." when the
        # parameters themselves are float16.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
        )

        print("✅ Model loaded successfully")
        return self.model, self.tokenizer

    def create_training_data(self, training_data):
        """Tokenize prompt/response pairs into a dataset for causal-LM training.

        Each example is rendered as
        ``<|user|>\\n{input}<|end|>\\n<|assistant|>\\n{output}<|end|>``,
        truncated to 512 tokens and padded to the longest sequence in its
        tokenization batch. ``load_model`` must have been called first so that
        ``self.tokenizer`` is set.

        Args:
            training_data: Sequence of dicts with ``'input'`` and ``'output'``
                string entries.

        Returns:
            datasets.Dataset: The tokenizer outputs plus a ``labels`` column.

        Raises:
            ValueError: If ``training_data`` is empty.
        """
        if not training_data:
            raise ValueError("training_data must contain at least one example")

        def tokenize_function(examples):
            texts = [
                f"<|user|>\n{input_text}<|end|>\n<|assistant|>\n{output_text}<|end|>"
                for input_text, output_text in zip(examples['input'], examples['output'])
            ]

            tokenized = self.tokenizer(
                texts,
                truncation=True,
                padding=True,
                max_length=512,
                return_tensors="pt",
            )

            # Labels mirror input_ids for causal LM, but padding positions are
            # set to -100 so the loss ignores them. Masking by attention_mask
            # rather than by token id keeps real EOS tokens in the loss even
            # when the pad token is the EOS token.
            labels = tokenized['input_ids'].clone()
            labels[tokenized['attention_mask'] == 0] = -100
            tokenized['labels'] = labels
            return tokenized

        dataset = Dataset.from_dict({
            'input': [item['input'] for item in training_data],
            'output': [item['output'] for item in training_data],
        })

        # Drop the raw text columns: fine_tune_model sets
        # remove_unused_columns=False, so they would otherwise reach the data
        # collator, which cannot turn strings into tensors.
        # NOTE(review): padding is per map batch, so datasets larger than one
        # map batch may end up with differing sequence lengths — confirm the
        # collator in use copes with that before training on large datasets.
        return dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=dataset.column_names,
        )

    def fine_tune_model(self, model_name, training_data, epochs=3, learning_rate=5e-5,
                        output_dir="./models", push_to_hub=False, hub_model_id=None):
        """Fine-tune ``model_name`` on ``training_data`` and save the result.

        Args:
            model_name: Model identifier passed to ``load_model``.
            training_data: Sequence of dicts with ``'input'`` and ``'output'``.
            epochs: Number of training epochs.
            learning_rate: Learning rate handed to ``TrainingArguments``.
            output_dir: Directory for checkpoints, the final model and tokenizer.
            push_to_hub: Upload the trained model when True and ``hub_model_id``
                is set.
            hub_model_id: Target Hub repository id, e.g. ``"user/model"``.

        Returns:
            tuple: ``(trainer, output_dir)`` where ``trainer`` is the
            ``transformers.Trainer`` that ran the training.

        Raises:
            ValueError: If ``training_data`` is empty.
        """
        # Fail before the expensive model download/load.
        if not training_data:
            raise ValueError("training_data must contain at least one example")

        print(f"🚀 Starting fine-tuning on Colab T4 GPU")
        print(f"Model: {model_name}")
        print(f"Training examples: {len(training_data)}")
        print(f"Epochs: {epochs}")
        print(f"Learning rate: {learning_rate}")

        model, tokenizer = self.load_model(model_name)
        train_dataset = self.create_training_data(training_data)

        # Training arguments optimized for T4.
        # - No evaluation keyword is passed: "no" is the default, and the
        #   keyword was renamed in newer transformers releases, so omitting it
        #   works on every version.
        # - report_to="none" is the documented way to disable logging
        #   integrations; None falls back to the library default.
        # - hub_model_id is set here because Trainer.push_to_hub() takes the
        #   target repository from the arguments, not from its own parameters.
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=2,  # Small batch for T4
            gradient_accumulation_steps=4,  # Effective batch size = 8
            learning_rate=learning_rate,
            warmup_steps=100,
            logging_steps=10,
            save_steps=500,
            save_strategy="steps",
            load_best_model_at_end=False,
            report_to="none",
            fp16=torch.cuda.is_available(),  # Mixed precision on GPU (fp32 weights)
            dataloader_pin_memory=False,
            remove_unused_columns=False,
            hub_model_id=hub_model_id if push_to_hub else None,
        )

        # NOTE(review): newer transformers releases name this argument
        # `processing_class` — confirm against the installed version.
        hf_trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            tokenizer=tokenizer,
        )

        print("🔥 Training started on T4 GPU...")
        hf_trainer.train()

        hf_trainer.save_model()
        tokenizer.save_pretrained(output_dir)
        print(f"✅ Model saved to {output_dir}")

        if push_to_hub and hub_model_id:
            print(f"📤 Pushing model to Hugging Face Hub: {hub_model_id}")
            # The first positional parameter of Trainer.push_to_hub is the
            # commit message, so the repository id must not be passed here.
            hf_trainer.push_to_hub()
            print(f"✅ Model pushed to: https://huggingface.co/{hub_model_id}")

        return hf_trainer, output_dir


# Initialize trainer
trainer = FineTuningTrainer(device="auto")
print("✅ FineTuningTrainer initialized")

In [None]:
# Example: Manual training trigger
# The (prompt, response) pairs below are placeholder data; swap in real examples.
training_data = [
    dict(input=prompt, output=response)
    for prompt, response in (
        (
            "What is a phishing attack?",
            "A phishing attack is a type of social engineering attack where attackers impersonate legitimate entities to trick victims into revealing sensitive information.",
        ),
        (
            "How do I detect malware?",
            "You can detect malware through behavioral analysis, signature-based detection, heuristic analysis, and monitoring for suspicious network traffic patterns.",
        ),
        (
            "What is threat hunting?",
            "Threat hunting is the proactive search for threats and malicious activity within an organization's network that may have evaded existing security controls.",
        ),
    )
]

# Base model to fine-tune and the Hub repository the result is pushed to
model_name = "microsoft/Phi-3-mini-4k-instruct"
hub_model_id = "dfirtnt/test-cti-model-colab"

# Single-epoch smoke run; the result is saved locally and pushed to the Hub
trainer_instance, output_path = trainer.fine_tune_model(
    model_name,
    training_data,
    epochs=1,
    learning_rate=5e-5,
    output_dir="./models/colab_test",
    push_to_hub=True,
    hub_model_id=hub_model_id,
)

print("🎉 Training completed successfully!")