# 📊 Data Preprocessing for Insurance Fine-tuning

This notebook handles the complete data preprocessing pipeline for insurance documents:

## What this notebook does:
1. Load and inspect raw insurance documents
2. Remove PII (Personally Identifiable Information)
3. Clean and standardize text format
4. Create task-specific datasets (classification, QA, summarization)
5. Split data into train/validation/test sets
6. Validate data quality and save processed datasets

**⚠️ Important: Always verify PII removal before proceeding to training**

## 1. Import Libraries and Setup

In [None]:
import os
import re
import json
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Dict, Tuple
import warnings
from tqdm.auto import tqdm
from datetime import datetime
import hashlib

# Data processing
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Text processing
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook asks for (quietly, no progress output)
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

# Suppress every Python warning for the remainder of the session
warnings.filterwarnings('ignore')
# Hook tqdm into pandas
tqdm.pandas()

print("✅ Libraries imported successfully")
print(f"Working directory: {os.getcwd()}")

## 2. Configuration and Constants

In [None]:
# Directory paths (relative to the current working directory)
RAW_DATA_DIR = Path("data/raw")
PROCESSED_DATA_DIR = Path("data/processed")
ANNOTATIONS_DIR = Path("data/annotations")

# Create directories if they don't exist.
# parents=True is required: on a fresh checkout the "data" parent is missing
# and a plain mkdir() would raise FileNotFoundError.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
ANNOTATIONS_DIR.mkdir(parents=True, exist_ok=True)

# Data split ratios (train + validation + test must sum to 1.0)
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Insurance task types: task identifier -> short human-readable description
TASK_TYPES = {
    'CLAIM_CLASSIFICATION': 'Categorize insurance claims',
    'POLICY_SUMMARIZATION': 'Summarize policy documents',
    'FAQ_GENERATION': 'Generate FAQs from policies',
    'COMPLIANCE_CHECK': 'Identify compliance requirements',
    'CONTRACT_QA': 'Answer questions about contracts',
}

# Text processing parameters
MIN_TEXT_LENGTH = 50  # Minimum character length for valid documents
MAX_TEXT_LENGTH = 8192  # Maximum character length for model context
MAX_SUMMARY_LENGTH = 512  # Maximum summary length

print("Configuration loaded:")
print(f"- Raw data directory: {RAW_DATA_DIR}")
print(f"- Processed data directory: {PROCESSED_DATA_DIR}")
print(f"- Task types: {list(TASK_TYPES.keys())}")

## 3. PII Detection and Removal

In [None]:
class PIIRemover:
    """Regex-based detection and redaction of common PII in insurance documents.

    Matching is purely pattern-based and case-insensitive. It recognises
    formats, not meaning: any 5-digit number counts as a ZIP code and any
    MM/DD/YYYY date counts as a date of birth, so expect some over-redaction.
    Always review the output before using it for training.
    """

    # Placeholder written in place of each PII category. The key order is
    # also the key order of the stats dict returned by remove_pii().
    _PLACEHOLDERS = {
        'ssn': '[SSN]',
        'phone': '[PHONE]',
        'email': '[EMAIL]',
        'zip_code': '[ZIP]',
        'credit_card': '[CARD_NUMBER]',
        'account_number': '[ACCOUNT_NUMBER]',
        'date_of_birth': '[DATE_OF_BIRTH]',
        'address_number': '[ADDRESS]',
    }

    # Redaction runs most-specific first so that a broad pattern cannot eat
    # part of a longer identifier (e.g. the 9 digits of "policy #123456789"
    # being taken as an SSN, or the house number of "12345 Main Street"
    # being taken as a ZIP code and leaving the street name behind).
    _REDACTION_ORDER = (
        'account_number',
        'credit_card',
        'email',
        'date_of_birth',
        'ssn',
        'phone',
        'address_number',
        'zip_code',
    )

    def __init__(self):
        # Regex patterns for common PII. These are raw strings, so each regex
        # escape uses a SINGLE backslash; a doubled backslash would match a
        # literal backslash character and the pattern would never fire.
        self.patterns = {
            'ssn': r'\b\d{3}-\d{2}-\d{4}\b|\b\d{9}\b',
            # (?<!\w) instead of \b so a leading "(" or "+" can start the match;
            # separators may be "-", "." or whitespace. No capturing groups.
            'phone': r'(?<!\w)(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'zip_code': r'\b\d{5}(?:-\d{4})?\b',
            'credit_card': r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
            'account_number': r'\b(?:account|acct|policy)\s*#?\s*\d{6,}\b',
            'date_of_birth': r'\b(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12]\d|3[01])[/-](?:19|20)\d{2}\b',
            # House number, one to four name words, then the street suffix as
            # its own word (so "3 items were lost" is not read as "... St").
            'address_number': (
                r'\b\d{1,5}[ \t]+(?:[A-Za-z]+[ \t]+){1,4}'
                r'(?:Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Lane|Ln|Boulevard|Blvd)\b'
            ),
        }

    def detect_pii(self, text: str) -> Dict[str, List[str]]:
        """Return the PII found in ``text`` without modifying it.

        Args:
            text: Text to scan.

        Returns:
            Mapping of PII category to the list of full matched substrings.
            Categories with no match are omitted. Each pattern is applied
            independently to the original text, so one substring may be
            reported under more than one category.
        """
        findings = {}

        for pii_type, pattern in self.patterns.items():
            # group(0) always yields the whole match as a string, regardless
            # of any groups inside the pattern.
            matches = [m.group(0) for m in re.finditer(pattern, text, re.IGNORECASE)]
            if matches:
                findings[pii_type] = matches

        return findings

    def remove_pii(self, text: str) -> Tuple[str, Dict[str, int]]:
        """Replace PII in ``text`` with generic placeholders.

        Args:
            text: Text to redact.

        Returns:
            A ``(cleaned_text, removal_stats)`` tuple where ``removal_stats``
            maps every PII category to the number of replacements made
            (0 when nothing was found).
        """
        cleaned_text = text
        removal_stats = dict.fromkeys(self._PLACEHOLDERS, 0)

        for pii_type in self._REDACTION_ORDER:
            # subn replaces and counts in a single pass over the text.
            cleaned_text, replaced = re.subn(
                self.patterns[pii_type],
                self._PLACEHOLDERS[pii_type],
                cleaned_text,
                flags=re.IGNORECASE,
            )
            removal_stats[pii_type] = replaced

        return cleaned_text, removal_stats

# Smoke-test the remover on a sentence that contains an SSN and a phone number
pii_remover = PIIRemover()
sample_text = "John Smith's SSN is 123-45-6789 and phone is (555) 123-4567."
cleaned, stats = pii_remover.remove_pii(sample_text)
# One print call, one line per field
print(
    f"Original: {sample_text}",
    f"Cleaned: {cleaned}",
    f"Stats: {stats}",
    sep="\n",
)

## 4. Create Sample Insurance Data

In [None]:
def create_sample_insurance_data() -> List[Dict[str, str]]:
    """Create three sample insurance documents and write them to disk.

    The documents (a health policy, an auto claim and a compliance note) are
    written as a JSON array to ``RAW_DATA_DIR / 'sample_insurance_docs.json'``;
    the directory is created if needed.

    Returns:
        The list of document dicts, each with the keys ``id``, ``source``,
        ``content``, ``type`` and ``task_type``.
    """

    sample_documents = [
        {
            'id': 'health_policy_001',
            'source': 'sample_data',
            'content': '''Health Insurance Policy - Premium Coverage
            
Coverage: This comprehensive health insurance policy provides coverage for medical expenses including hospital stays, doctor visits, prescription medications, and emergency care. The annual coverage limit is $1,000,000 per insured individual.
            
Deductible: Annual deductible of $1,500 per individual, $3,000 per family. After meeting the deductible, the plan covers 80% of eligible medical expenses.
            
Exclusions: Pre-existing conditions diagnosed within 12 months prior to policy effective date, cosmetic procedures, experimental treatments, and services not deemed medically necessary are excluded from coverage.
            
Premium: Monthly premium of $450 for individual coverage, $1,200 for family coverage. Premiums are due on the first of each month.''',
            'type': 'health_policy',
            'task_type': 'POLICY_SUMMARIZATION'
        },
        {
            'id': 'auto_claim_001',
            'source': 'sample_data',
            'content': '''Auto Insurance Claim - Vehicle Collision
            
Claim Details: Vehicle collision occurred on Highway 101 involving two vehicles. Insured vehicle sustained front-end damage requiring repair. No injuries reported. Police report filed, case number 2024-001234.
            
Coverage Applied: Collision coverage with $500 deductible. Estimated repair cost $3,200. Coverage approved for $2,700 after deductible.
            
Settlement: Claim approved and processed. Payment issued to approved repair facility. Rental car coverage provided for 5 days during repair period.''',
            'type': 'auto_claim',
            'task_type': 'CLAIM_CLASSIFICATION'
        },
        {
            'id': 'compliance_doc_001',
            'source': 'sample_data',
            'content': '''Insurance Regulatory Compliance Requirements
            
HIPAA Compliance: All health insurance operations must comply with Health Insurance Portability and Accountability Act requirements for protecting patient health information privacy and security.
            
State Regulations: Insurance products must be filed with and approved by state insurance commissioners before sale. Rate changes require regulatory approval.
            
Consumer Protection: All marketing materials must be clear, truthful, and not misleading. Claims processing must be fair and timely according to state prompt payment laws.''',
            'type': 'compliance',
            'task_type': 'COMPLIANCE_CHECK'
        }
    ]

    # Save sample data to files.
    # parents=True so this also works when the parent "data" directory does
    # not exist yet (a plain mkdir() would raise FileNotFoundError).
    RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
    sample_file = RAW_DATA_DIR / 'sample_insurance_docs.json'

    with open(sample_file, 'w', encoding='utf-8') as f:
        json.dump(sample_documents, f, indent=2, ensure_ascii=False)

    print(f"✅ Created sample data file: {sample_file}")
    return sample_documents

# Generate the sample corpus, then list one line per document
sample_docs = create_sample_insurance_data()
print(f"Created {len(sample_docs)} sample documents")
for doc in sample_docs:
    # Fields are pulled straight from the document dict by key
    print("- {id}: {type} ({task_type})".format(**doc))

## 5. Process Data and Create Datasets

In [None]:
def _instruction_and_output(doc, task_type):
    """Return the (instruction, output) pair for a supported task type, else None."""
    if task_type == 'POLICY_SUMMARIZATION':
        return (
            'Summarize the following insurance policy document.',
            f"This {doc['type']} document covers key insurance terms including coverage details, deductibles, and important policy information.",
        )
    if task_type == 'CLAIM_CLASSIFICATION':
        return (
            'Classify this insurance claim into the appropriate category.',
            f"This is a {doc['type']} claim.",
        )
    if task_type == 'COMPLIANCE_CHECK':
        return (
            'Identify compliance requirements in this insurance document.',
            'Key compliance requirements include HIPAA privacy protections, state regulatory approvals, and consumer protection standards.',
        )
    return None


def process_and_create_datasets(documents):
    """Turn raw documents into instruction-style training records.

    Each document's ``content`` is passed through the module-level
    ``pii_remover`` and the redacted text becomes the record's ``input``.
    Documents whose ``task_type`` is not POLICY_SUMMARIZATION,
    CLAIM_CLASSIFICATION or COMPLIANCE_CHECK produce no record.

    Returns a list of dicts with the keys ``instruction``, ``input``,
    ``output``, ``task_type`` and ``doc_id``.
    """
    records = []

    for doc in documents:
        # Redact first; the per-category counts are not needed here
        redacted, _ = pii_remover.remove_pii(doc['content'])
        kind = doc['task_type']

        pair = _instruction_and_output(doc, kind)
        if pair is None:
            # Unsupported task type: nothing to emit for this document
            continue

        instruction, output = pair
        records.append({
            'instruction': instruction,
            'input': redacted,
            'output': output,
            'task_type': kind,
            'doc_id': doc['id'],
        })

    return records

def create_train_test_splits(examples):
    """Create train/validation/test splits (roughly 70/15/15).

    Args:
        examples: List of example records.

    Returns:
        Dict with the keys ``'train'``, ``'validation'`` and ``'test'``.
        With fewer than 4 examples a real three-way split is not possible,
        so every example goes to ``'train'`` and the first example (if any)
        is reused for both ``'validation'`` and ``'test'``. Those two splits
        are then NOT independent of the training data and are only useful
        for smoke-testing the pipeline.
    """
    # The 30% hold-out is split in half again, so it must contain at least
    # two examples. With 3 inputs the hold-out has a single example and the
    # second train_test_split raises ValueError (empty train set); 4 is the
    # smallest input for which both splits succeed.
    min_examples_for_split = 4

    if len(examples) < min_examples_for_split:
        # Too few examples, use all for training
        first_example = examples[:1] if examples else []
        return {
            'train': examples,
            'validation': first_example,
            'test': first_example,
        }

    # Hold out 30% of the data, deterministically
    train_examples, temp_examples = train_test_split(
        examples, test_size=0.3, random_state=42
    )

    # Halve the hold-out into validation and test
    val_examples, test_examples = train_test_split(
        temp_examples, test_size=0.5, random_state=42
    )

    return {
        'train': train_examples,
        'validation': val_examples,
        'test': test_examples,
    }

def save_datasets(data_splits):
    """Save processed datasets under ``PROCESSED_DATA_DIR / 'combined'``.

    For every split this writes ``<split>.json`` and, when the split is not
    empty, a HuggingFace dataset directory ``<split>_hf``. A ``metadata.json``
    with the processing timestamp, per-split counts and the configured task
    types is written alongside.

    Args:
        data_splits: Mapping of split name to a list of example dicts.
    """

    # Create combined dataset directory.
    # parents=True so saving still works if the processed-data directory is
    # missing (a plain mkdir() would raise FileNotFoundError).
    combined_dir = PROCESSED_DATA_DIR / 'combined'
    combined_dir.mkdir(parents=True, exist_ok=True)

    for split_name, examples in data_splits.items():
        # Save as JSON
        json_file = combined_dir / f"{split_name}.json"
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(examples, f, indent=2, ensure_ascii=False)

        # Save as HuggingFace dataset
        if examples:  # Only create dataset if we have examples
            dataset = Dataset.from_list(examples)
            hf_dir = combined_dir / f"{split_name}_hf"
            dataset.save_to_disk(hf_dir)

        print(f"✅ Saved {split_name}: {len(examples)} examples")

    # Save processing metadata.
    # total_examples sums the split sizes, so an example that appears in
    # more than one split is counted once per split.
    metadata = {
        'processing_date': datetime.now().isoformat(),
        'total_examples': sum(len(examples) for examples in data_splits.values()),
        'splits': {split: len(examples) for split, examples in data_splits.items()},
        'task_types': list(TASK_TYPES.keys())
    }

    metadata_file = combined_dir / 'metadata.json'
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    print(f"✅ Saved metadata: {metadata_file}")

# Stage 1: redact the sample documents and build instruction records
print("Processing documents and creating datasets...")
processed_examples = process_and_create_datasets(sample_docs)

print(f"\nCreated {len(processed_examples)} training examples:")
for example in processed_examples:
    print(f"- {example['doc_id']}: {example['task_type']}")

# Stage 2: partition the records into train/validation/test
data_splits = create_train_test_splits(processed_examples)

print("\nData splits created:")
for split_name, examples in data_splits.items():
    print(f"- {split_name}: {len(examples)} examples")

# Stage 3: persist every split plus the run metadata
save_datasets(data_splits)

print("\n✅ Data preprocessing complete!")
print(f"Processed data saved to: {PROCESSED_DATA_DIR}")
# Closing hints, emitted one per line
print(
    "\nNext steps:",
    "1. Review the processed datasets",
    "2. Run 02_tokenization.ipynb to prepare data for training",
    "3. Proceed to 03_finetuning_lora.ipynb for model training",
    sep="\n",
)