# Synthetic + PII-Mixed Data Generation
This notebook demonstrates generating and inspecting synthetic patient records with mixed PII and non-PII content.

In [20]:
import os
import subprocess
import json
from pathlib import Path

import sys  # local import: only needed to locate this kernel's interpreter below

# Project root. Set the PROJECT_ROOT environment variable to point at a checkout
# in another location; the default preserves the original hard-coded path.
PROJECT_ROOT = Path(os.environ.get('PROJECT_ROOT', '/Users/cep4u/red_teaming_challenge_openai_oss_20b'))
os.chdir(PROJECT_ROOT)  # Ensure working directory is project root (relative paths below depend on it)

# Run the updated local generator (multi-format, multi-scenario).
# sys.executable runs the script with the same interpreter as this kernel, and
# check=True raises CalledProcessError on a non-zero exit so the success message
# below is never printed after a failed run (and stale data is never previewed).
subprocess.run([sys.executable, "scripts/generate_local.py"], check=True)
print('Local synthetic data generated.')

# Preview bundles and multi-format outputs.
# The file is JSON Lines: one JSON object per line. Blank lines (e.g. a trailing
# newline at end of file) are skipped instead of crashing json.loads.
with open('data/items.jsonl') as f:
    records = [json.loads(line) for line in f if line.strip()]

# Fail with a clear message rather than an opaque IndexError on records[0].
if not records:
    raise ValueError('data/items.jsonl contains no records')

# Show first bundle's files and metadata
first_bundle = records[0]['bundle_id']
bundle_files = [r for r in records if r['bundle_id'] == first_bundle]
for rec in bundle_files:
    print(f"{rec['file_type']}: {rec['file_path']}")
    print(f"Scenario: {rec['scenario']}")
    print(f"PII types: {[span['type'] for span in rec['pii_spans']]}")
    print(f"Decoy spans: {rec['decoy_spans']}")
    print(f"Blocks meta: {rec['blocks_meta']}")
    print('-'*40)

Wrote 2700 rows to /Users/cep4u/red_teaming_challenge_openai_oss_20b/data/items.jsonl. File types: {'note': 300, 'csv_labs': 300, 'csv_meds': 300, 'csv_vitals': 300, 'hl7': 300, 'fhir': 300, 'cda': 300, 'email': 300, 'ics': 300} Bundles: 60
Local synthetic data generated.
note: /Users/cep4u/red_teaming_challenge_openai_oss_20b/data/bundles/bundle_0000/note_00.md
Scenario: control
PII types: ['name', 'dob', 'address', 'email', 'phone', 'mrn', 'ssn']
Decoy spans: [{'label': 'DECOY_SSN_PATTERN', 'start': 198, 'end': 216}]
Blocks meta: ['progress']
----------------------------------------
csv_labs: /Users/cep4u/red_teaming_challenge_openai_oss_20b/data/bundles/bundle_0000/lab_results.csv
Scenario: control
PII types: ['name', 'mrn']
Decoy spans: [{'label': 'DECOY_SSN_PATTERN', 'start': 102, 'end': 120}]
Blocks meta: ['csv_labs']
----------------------------------------
csv_meds: /Users/cep4u/red_teaming_challenge_openai_oss_20b/data/bundles/bundle_0000/medications.csv
Scenario: control
PII 

In [None]:
import json
# Load the generated records (JSON Lines: one JSON object per line).
# The relative path is resolved against the current working directory.
# Blank lines are skipped so a trailing newline does not crash json.loads.
with open('data/items.jsonl') as f:
    records = [json.loads(line) for line in f if line.strip()]
print(records[:2])  # Preview first 2 records

# Validate schema and fields in generated local synthetic data
# NOTE(review): the preview output only demonstrates 'pii_spans', 'decoy_spans'
# and 'blocks_meta'; confirm the generator also emits 'variant', 'text' and 'meta'.
EXPECTED_PII_TYPES = {'NAME','DOB','EMAIL','PHONE','ADDRESS','MRN','SSN'}
for rec in records[:3]:
    assert 'pii_spans' in rec, 'PII spans missing'
    assert 'variant' in rec, 'Variant missing'
    assert 'text' in rec, 'Text missing'
    assert 'meta' in rec, 'Meta missing'
    # The generated records label span types in lower case (e.g. 'name', 'mrn'),
    # so normalise to upper case before intersecting with the expected set;
    # a case-sensitive comparison could never match and the check always failed.
    pii_types = {str(span['type']).upper() for span in rec['pii_spans']}
    assert len(pii_types & EXPECTED_PII_TYPES) > 0, 'Missing PII types'
    assert 'decoy_spans' in rec, 'Decoy spans missing'
    assert 'blocks_meta' in rec, 'Blocks meta missing'
print('Schema and fields validated for first 3 records.')