# RWA Compliance AI - Data Exploration

This notebook explores the regulatory data corpus for training compliance AI models.

## Contents
1. Load and inspect jurisdiction rules
2. Compare accreditation thresholds across jurisdictions
3. Identify common conflict patterns
4. Prepare training data samples

In [None]:
import json
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Set paths
# DATA_DIR is relative, so it is resolved against the kernel's working directory.
DATA_DIR = Path('../data/jurisdictions')
print(f'Data directory: {DATA_DIR}')

# A wrong working directory would otherwise go unnoticed: globbing a missing
# directory simply yields no files. Warn here instead of raising so the rest of
# the notebook (which tolerates missing jurisdictions) can still run.
if not DATA_DIR.is_dir():
    print(f'WARNING: {DATA_DIR.resolve()} is not a directory - no jurisdiction files will be loaded')

## 1. Load Jurisdiction Rules

In [None]:
# Load all jurisdiction files
# Every *.json file in DATA_DIR is parsed and stored under its top-level
# 'jurisdiction' value, falling back to the file name without its extension.
jurisdictions = {}

# sorted(): glob order depends on the filesystem. A stable order keeps the
# printed list reproducible and makes "last file wins" deterministic when two
# files declare the same jurisdiction key.
for file in sorted(DATA_DIR.glob('*.json')):
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    jurisdictions[data.get('jurisdiction', file.stem)] = data

if not jurisdictions:
    # Later cells guard on membership, so an empty load would otherwise pass silently.
    print(f'WARNING: no *.json files found in {DATA_DIR}')

print(f'Loaded {len(jurisdictions)} jurisdictions: {list(jurisdictions.keys())}')

In [None]:
# Peek at the top-level keys of the US rule set (skipped when no 'US' entry was loaded)
if 'US' in jurisdictions:
    us_rules = jurisdictions['US']
    print(
        'US SEC Rules Structure:',
        json.dumps([*us_rules.keys()], indent=2),
        sep='\n',
    )

## 2. Accreditation Thresholds Comparison

In [None]:
# Compare accreditation thresholds across jurisdictions
# NOTE: the figures below are hard-coded reference values (USD, with approximate
# FX conversions as noted); they are not read from the loaded jurisdiction files.
_THRESHOLD_COLUMNS = ['Jurisdiction', 'Net Worth (USD)', 'Income (USD)', 'Type']

_REFERENCE_THRESHOLDS = [
    {
        'Jurisdiction': 'US',
        'Net Worth (USD)': 1000000,
        'Income (USD)': 200000,
        'Type': 'Accredited Investor'
    },
    {
        'Jurisdiction': 'EU',
        'Net Worth (USD)': 550000,  # ~€500K converted
        'Income (USD)': None,
        'Type': 'Professional Client'
    },
    {
        'Jurisdiction': 'SG',
        'Net Worth (USD)': 1500000,  # ~S$2M converted
        'Income (USD)': 225000,  # ~S$300K converted
        'Type': 'Accredited Investor'
    },
]

# Keep only the rows whose jurisdiction was actually loaded, in US/EU/SG order.
thresholds = [
    row for row in _REFERENCE_THRESHOLDS
    if row['Jurisdiction'] in jurisdictions
]

# Explicit columns: an empty `thresholds` list still yields a frame with the
# expected schema instead of a column-less DataFrame.
df_thresholds = pd.DataFrame(thresholds, columns=_THRESHOLD_COLUMNS)
print(df_thresholds)

In [None]:
# Visualize thresholds
# Bar chart of the net-worth threshold per jurisdiction.
if df_thresholds.empty:
    # Plotting an empty frame raises; report the situation instead so a
    # Run All on a machine without the data files still completes.
    print('No accreditation thresholds to plot (no known jurisdictions were loaded).')
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    df_thresholds.plot(x='Jurisdiction', y='Net Worth (USD)', kind='bar', ax=ax)
    # Use the explicit Axes/Figure objects rather than pyplot's implicit "current" state.
    ax.set_title('Accreditation Net Worth Thresholds by Jurisdiction')
    ax.set_ylabel('USD')
    fig.tight_layout()
    plt.show()

## 3. Identify Conflict Patterns

In [None]:
# Hand-written examples of cross-jurisdiction rule conflicts, used as seed
# patterns for training data. Each record names the conflict type, the two
# jurisdictions involved, a description, the resolution strategy and the
# resulting rule.
# NOTE(review): descriptions are literals written in this cell, not derived
# from the loaded jurisdiction files - confirm them against the rule corpus.
conflict_patterns = [
    dict(
        type='accreditation_threshold',
        jurisdictions=['US', 'EU'],
        description='US requires $1M net worth, EU requires €500K portfolio',
        resolution='apply_strictest',
        resolved_rule='Require $1M net worth AND €500K portfolio',
    ),
    dict(
        type='lockup_period',
        jurisdictions=['US', 'SG'],
        description='US Reg D has no lockup, SG Section 275 has 6-month restriction',
        resolution='apply_strictest',
        resolved_rule='Apply 6-month lockup for all',
    ),
    dict(
        type='investor_cap',
        jurisdictions=['US', 'EU'],
        description='US Reg D allows 2000 investors, EU has no specific cap',
        resolution='apply_strictest',
        resolved_rule='Cap at 2000 investors',
    ),
]

df_conflicts = pd.DataFrame(conflict_patterns)
# Show only the compact columns; the long free-text fields are omitted.
print(df_conflicts.loc[:, ['type', 'jurisdictions', 'resolution']])

## 4. Training Data Sample Generation

In [None]:
# Seed examples for the jurisdiction classifier: each pairs a free-text
# document description with the expected structured label.
def _make_sample(document_text, jurisdiction, entity_type):
    """Build one training record; classification is always 'pending_verification'."""
    return {
        'input': document_text,
        'output': {
            'jurisdiction': jurisdiction,
            'entity_type': entity_type,
            'classification': 'pending_verification',
        },
    }


training_samples = [
    _make_sample(
        'W-9 form, SSN ending 5678, address: 123 Main St, New York, NY 10001',
        'US',
        'individual',
    ),
    _make_sample(
        'Certificate of Incorporation, Companies House number 12345678, registered office: London, UK',
        'UK',
        'corporation',
    ),
    _make_sample(
        'ACRA BizFile, UEN 202312345K, registered address: Singapore 048623',
        'SG',
        'corporation',
    ),
]

# Report the sample count and show the first record as a formatting check.
print(f'Generated {len(training_samples)} training samples')
print(json.dumps(training_samples[0], indent=2))

## Next Steps

1. Expand jurisdiction rules with more detail
2. Generate larger training dataset (1000+ samples per task)
3. Add edge cases and multi-jurisdiction scenarios
4. Proceed to model fine-tuning notebook