# RWA Compliance AI - Model Fine-Tuning

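This notebook demonstrates fine-tuning open-source LLMs for compliance tasks. The code below implements only the first task (jurisdiction classification), using Mistral 7B loaded with 4-bit quantization and LoRA adapters; the other model and tasks listed here are not yet implemented in this notebook.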

## Models
- **Mistral 7B** - Primary model for compliance reasoning
- **Legal-BERT** - For document classification

## Tasks
1. Jurisdiction Classification
2. Conflict Resolution
3. Document Generation

In [None]:
# Install dependencies (run once)
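# %pip install torch transformers datasets peft accelerate bitsandbytes tensorboard
# (tensorboard is required because the training configuration uses report_to='tensorboard')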

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import json
from pathlib import Path

# Environment report: show the installed PyTorch version and whether a CUDA
# device is visible to this kernel before any model is loaded.
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    # Only query the device name when CUDA is present; index 0 is the first device.
    print(f'GPU: {torch.cuda.get_device_name(0)}')

## 1. Load Base Model with Quantization

In [None]:
MODEL_NAME = 'mistralai/Mistral-7B-v0.1'

# 4-bit NF4 weight quantization with double quantization enabled;
# computation on the quantized weights is carried out in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
)

# Tokenizer for the base checkpoint; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

print(f'Tokenizer loaded: {MODEL_NAME}')

In [None]:
# Load the base model using the 4-bit quantization config.
# `trust_remote_code` is deliberately NOT enabled: the Mistral architecture is
# implemented inside the transformers library itself, so no repository-hosted
# Python needs to run, and leaving the flag off prevents arbitrary code from
# the Hub being executed on this machine.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map='auto'
)

# Prepare the quantized model for k-bit (LoRA) training before adapters are attached.
model = prepare_model_for_kbit_training(model)
print('Model loaded and prepared for training')

## 2. Configure LoRA

In [None]:
# LoRA adapter settings: low-rank adapters on the attention projection layers.
lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj'],  # attention projections
    r=16,               # adapter rank
    lora_alpha=32,      # alpha scaling factor
    lora_dropout=0.05,  # dropout on the adapter path
    bias='none',
)

# Wrap the base model with the adapters and report the trainable-parameter count.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 3. Prepare Training Data

In [None]:
# Sample training data for jurisdiction classification.
#
# Each record has three string fields:
#   instruction - the task description shown to the model
#   input       - the document summary to analyze
#   output      - the target answer, serialized as a JSON string with the keys
#                 jurisdiction, entity_type, classification, reasoning and
#                 applicable_regulations
#
# json.dumps is called with ensure_ascii=False so that non-ASCII characters in
# the target (e.g. the euro sign) are kept as-is. With the default
# (ensure_ascii=True) they are written as literal "\uXXXX" escape sequences,
# and the model would be trained to emit those escapes instead of the character.
training_data = [
    {
        'instruction': 'Analyze the following document and determine the investor jurisdiction and classification.',
        'input': 'W-9 form submitted. SSN ending 5678. Address: 123 Main Street, New York, NY 10001. Annual income reported: $275,000 for 2023 and $260,000 for 2022.',
        'output': json.dumps({
            'jurisdiction': 'US',
            'entity_type': 'individual',
            'classification': 'accredited',
            'reasoning': 'US person (W-9 + NY address). Qualifies as accredited investor under SEC Rule 501(a) - income exceeds $200K threshold for 2 consecutive years.',
            'applicable_regulations': ['SEC Reg D 501(a)', 'FINRA Rule 5123']
        }, ensure_ascii=False)
    },
    {
        # NOTE(review): the input reports net assets in GBP while the reasoning
        # cites a EUR threshold - confirm the intended comparison with compliance.
        'instruction': 'Analyze the following document and determine the investor jurisdiction and classification.',
        'input': 'Certificate of Incorporation from Companies House. Company number: 12345678. Registered office: 100 Bishopsgate, London EC2N 4AG. Latest accounts show net assets of £15,000,000.',
        'output': json.dumps({
            'jurisdiction': 'UK',
            'entity_type': 'corporation',
            'classification': 'professional',
            'reasoning': 'UK company (Companies House registration). Qualifies as per se professional client under MiFID II - large undertaking with net assets exceeding €2M threshold.',
            'applicable_regulations': ['FCA COBS 3.5', 'MiFID II Annex II']
        }, ensure_ascii=False)
    },
    {
        # NOTE(review): the input states paid-up capital, but the reasoning relies
        # on net assets - verify that this sample teaches the intended inference.
        'instruction': 'Analyze the following document and determine the investor jurisdiction and classification.',
        'input': 'ACRA Business Profile. UEN: 202312345K. Entity name: ABC Capital Pte Ltd. Registered address: 1 Raffles Place, Singapore 048616. Paid-up capital: S$15,000,000.',
        'output': json.dumps({
            'jurisdiction': 'SG',
            'entity_type': 'corporation',
            'classification': 'accredited',
            'reasoning': 'Singapore company (ACRA registration). Qualifies as accredited investor under SFA Section 4A(1)(a) - corporation with net assets exceeding S$10M.',
            'applicable_regulations': ['SFA Section 4A', 'SFA Section 275']
        }, ensure_ascii=False)
    }
]

print(f'Training samples: {len(training_data)}')

In [None]:
# Format for instruction tuning
def format_prompt(sample):
    """Render one training record as an instruction-tuning prompt.

    Args:
        sample: Mapping with the keys 'instruction', 'input' and 'output'.

    Returns:
        A single string with three sections ("### Instruction:", "### Input:",
        "### Response:"). Each header is followed by a newline and the
        corresponding value; sections are separated by one blank line.

    Raises:
        KeyError: If any of the three keys is missing from ``sample``.
    """
    sections = (
        ('### Instruction:', sample['instruction']),
        ('### Input:', sample['input']),
        ('### Response:', sample['output']),
    )
    return '\n\n'.join(f'{header}\n{body}' for header, body in sections)

# Tokenize
def tokenize(sample):
    """Tokenize one training record for causal-LM fine-tuning.

    The record is rendered with ``format_prompt``, terminated with the
    tokenizer's EOS token, and tokenized to a fixed length of 2048 tokens.

    Args:
        sample: Mapping with the keys expected by ``format_prompt``.

    Returns:
        The tokenizer output with an added ``labels`` entry. ``labels`` equals
        ``input_ids`` at real-token positions and is -100 at padded positions.
    """
    # Terminate the response explicitly so the model sees an end-of-sequence
    # target. This is needed because padding positions are masked out below and
    # the padding token is the same token as EOS in this notebook.
    eos = tokenizer.eos_token or ''
    prompt = format_prompt(sample) + eos
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=2048,
        padding='max_length'
    )
    # -100 is the label value ignored by the loss. Without this mask the loss
    # would be computed over every padding position (the vast majority of the
    # 2048 positions for short samples). attention_mask is used rather than
    # position so the mask is correct for either padding side.
    result['labels'] = [
        token_id if keep == 1 else -100
        for token_id, keep in zip(result['input_ids'], result['attention_mask'])
    ]
    return result

# Build a dataset from the in-memory records, then tokenize every record.
dataset = Dataset.from_list(training_data)
tokenized_dataset = dataset.map(function=tokenize)

print(f'Dataset prepared: {tokenized_dataset.num_rows} samples')

## 4. Training Configuration

In [None]:
# Hyperparameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    output_dir='../models/jurisdiction-classifier/checkpoints',
    # Optimization schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # Batching: 1 sample per device step, gradients accumulated over 8 steps
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Mixed precision
    fp16=True,
    # Logging and checkpointing
    logging_steps=10,
    save_strategy='epoch',
    report_to='tensorboard',
)

# Trainer over the LoRA-wrapped model and the tokenized samples.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
)

print('Trainer configured')

In [None]:
# Start training (uncomment to run)
# NOTE: training is disabled by default. Until the line below is uncommented,
# every later cell operates on the model without any fine-tuning applied.
# trainer.train()

print('Training would start here - uncomment trainer.train() to execute')

## 5. Save Model

In [None]:
# Save fine-tuned model (uncomment after training)
# OUTPUT_DIR = '../models/jurisdiction-classifier/final'
# trainer.save_model(OUTPUT_DIR)
# tokenizer.save_pretrained(OUTPUT_DIR)
# print(f'Model saved to {OUTPUT_DIR}')

## 6. Test Inference

In [None]:
# Test the model with a new sample.
# The prompt follows the same "### Instruction / ### Input / ### Response"
# layout as the training prompts and stops at the response header so the model
# generates the answer.
test_input = """### Instruction:
Analyze the following document and determine the investor jurisdiction and classification.

### Input:
Cayman Islands Certificate of Incorporation. Company number: MC-12345. Registered office: PO Box 309, George Town, Grand Cayman. Fund documents indicate this is a private investment fund with $50M AUM.

### Response:"""

# Put the model in evaluation mode so dropout layers (including the LoRA
# dropout configured earlier) are inactive during generation.
model.eval()

inputs = tokenizer(test_input, return_tensors='pt').to(model.device)

# Generate response with greedy decoding. `temperature` is intentionally not
# passed: it only affects sampling, so combining it with do_sample=False has no
# effect and makes transformers emit a configuration warning.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        do_sample=False,
        # Explicit pad id (EOS is used as padding in this notebook); avoids the
        # implicit fallback and its warning.
        pad_token_id=tokenizer.eos_token_id
    )

# The decoded text contains the prompt followed by the generated continuation.
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

## Next Steps

1. Expand training data to 1000+ samples
2. Add conflict resolution training
3. Add document generation training
4. Evaluate on benchmark test cases
5. Deploy to inference API