# Privacy-Preserving Synthetic Data Generator

This notebook demonstrates how to use the Privacy-Preserving Synthetic Data Generator to create synthetic tabular data with privacy guarantees using CTGAN and Differential Privacy.

## 1. Import Dependencies

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import time
from datetime import datetime

# Import custom modules
from data_preprocess import DataPreprocessor
from model_train import PrivacyPreservingCTGAN
from evaluate import SyntheticDataEvaluator

# Notebook-wide plotting defaults: seaborn grid theme and a 12x8 default figure
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 8))

## 2. Load and Preprocess Data

Load a CSV dataset and preprocess it for synthetic data generation.

In [None]:
# Location of the input CSV
# Point this at your own dataset
data_path = 'your_dataset.csv'  # Update this path

# Read the CSV. If reading fails for any reason, report the error and fall back
# to the California housing data obtained through scikit-learn's
# fetch_california_housing, so the remaining cells still have a frame to use.
try:
    df = pd.read_csv(data_path)
    print(f'Dataset loaded successfully! Shape: {df.shape}')
except Exception as e:
    print(f'Error loading dataset: {e}')
    print('Creating a sample dataset for demonstration...')
    # Imported lazily: only needed on the fallback path
    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()
    # Feature columns first, then the regression target appended as 'target'
    df = pd.DataFrame(housing.data, columns=housing.feature_names).assign(
        target=housing.target
    )
    print(f'Sample dataset created. Shape: {df.shape}')

# Preview the first rows
df.head()

In [None]:
# Build the preprocessor and ask it for a type label per column of `df`
preprocessor = DataPreprocessor()
column_types = preprocessor.detect_column_types(df)

# Show the detected mapping as a two-column table (column name, detected type)
type_table = pd.DataFrame({
    'Column': [*column_types],
    'Type': [*column_types.values()],
})
type_table


In [None]:
# Preprocess data: fit the preprocessor on the raw frame and transform it in
# one call, using the column types detected in the previous cell.
processed_df = preprocessor.fit_transform(df, column_types)

# Display processed data preview (shape first, then the leading rows as the
# cell's rich output)
print(f'Processed data shape: {processed_df.shape}')
processed_df.head()

## 3. Train Synthetic Data Model

Train a CTGAN model with optional Differential Privacy.

In [None]:
# Training configuration
use_dp = True  # Set to False to disable Differential Privacy
epsilon = 3.0  # Privacy budget (lower = more private)
delta = 1e-5  # Privacy failure probability
epochs = 300  # Training epochs
batch_size = 500  # Batch size

# Processed columns whose name begins with "<categorical column>_" are handed
# to the model as discrete columns.
categorical_prefixes = [f"{c}_" for c in preprocessor.categorical_columns]
discrete_columns = [
    col
    for col in processed_df.columns
    if any(col.startswith(prefix) for prefix in categorical_prefixes)
]

# Build the model from the configuration above
model = PrivacyPreservingCTGAN(
    use_dp=use_dp, epsilon=epsilon, delta=delta,
    epochs=epochs, batch_size=batch_size, verbose=True,
)

# Fit on the preprocessed frame; the returned dict carries the training metrics
print(f'Training model with {"Differential Privacy" if use_dp else "no privacy guarantees"}...')
training_metrics = model.train(processed_df, discrete_columns)

# Summarize the run
print(f'Training completed in {training_metrics["training_time"]:.2f} seconds')
print(f'Epochs: {training_metrics["epochs"]}')

# The (ε, δ) pair is only reported when DP was switched on
if use_dp:
    print(f'Final privacy guarantee: (ε={training_metrics["epsilon"]:.2f}, δ={training_metrics["delta"]:.1e})')

## 4. Generate Synthetic Data

Generate synthetic data using the trained model.

In [None]:
# Sample size: 5000 rows, or the size of the real dataset if that is smaller
num_rows = min(5000, df.shape[0])

# Draw samples from the trained model (still in the preprocessed representation)
print(f'Generating {num_rows} synthetic rows...')
synthetic_processed = model.generate(num_rows)

# Map the samples back through the preprocessor's inverse transform
synthetic_data = preprocessor.inverse_transform(synthetic_processed)

print(f'Generated {len(synthetic_data)} synthetic rows successfully!')

# Preview the first rows
synthetic_data.head()

In [None]:
# Timestamped CSV name inside the local 'output' directory
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"output/{timestamp}_synthetic_data.csv"

# Create the directory on first use, then write the frame without its index
os.makedirs('output', exist_ok=True)
synthetic_data.to_csv(output_path, index=False)
print(f'Synthetic data saved to: {output_path}')

## 5. Evaluate Synthetic Data

Evaluate the quality and privacy of the generated synthetic data.

In [None]:
# Create the evaluator and give it the column types detected earlier
evaluator = SyntheticDataEvaluator()
evaluator.set_column_types(column_types)

# Forward the (ε, δ) reported by training only when DP was enabled
privacy_metrics = (
    {name: training_metrics[name] for name in ('epsilon', 'delta')}
    if use_dp
    else None
)

# Use 'target' for the utility evaluation when the real data has such a column
target_column = None
if 'target' in df.columns:
    target_column = 'target'

# Compare real and synthetic data in a single call
print('Running evaluation...')
results = evaluator.evaluate_all(
    real_data=df,
    synthetic_data=synthetic_data,
    privacy_metrics=privacy_metrics,
    target_column=target_column,
)

print('Evaluation completed successfully!')

In [None]:
# Display fidelity metrics
print('\nFidelity Metrics:')
fidelity = results['fidelity']

# (result key, printed label) pairs in display order. Driving the output from
# this table replaces six copy-pasted "if present and not None" checks.
FIDELITY_LABELS = [
    ('overall_fidelity', 'Overall Fidelity'),
    ('ks_mean', 'KS Similarity (Mean)'),
    ('correlation_similarity', 'Correlation Similarity'),
    ('chi2_mean', 'Chi-Square Similarity (Mean)'),
    ('pmse_similarity', 'Propensity Similarity'),
    ('real_vs_synthetic_auc', 'Real vs Synthetic AUC'),
]

for metric_key, metric_label in FIDELITY_LABELS:
    metric_value = fidelity.get(metric_key)
    # A missing key and an explicit None both mean "not available": skip it
    if metric_value is not None:
        print(f"{metric_label}: {metric_value:.4f}")

In [None]:
# Display privacy metrics
print('\nPrivacy Metrics:')
privacy = results['privacy']

# Empirical scores share one format; print each one that is present and not None
PRIVACY_LABELS = [
    ('privacy_score', 'Privacy Score'),
    ('membership_inference_auc', 'Membership Inference AUC'),
]

for metric_key, metric_label in PRIVACY_LABELS:
    metric_value = privacy.get(metric_key)
    if metric_value is not None:
        print(f"{metric_label}: {metric_value:.4f}")

# Differential-privacy parameters: δ is only shown alongside ε
dp_epsilon = privacy.get('epsilon')
if dp_epsilon is not None:
    print(f"Differential Privacy ε: {dp_epsilon:.2f}")
    dp_delta = privacy.get('delta')
    if dp_delta is not None:
        print(f"Differential Privacy δ: {dp_delta:.1e}")

In [None]:
# Display utility metrics if available.
# The section is skipped both when the 'utility' key is absent and when it is
# present but None; the latter previously raised TypeError on the first lookup.
utility = results.get('utility')

if utility is not None:
    print('\nUtility Metrics:')

    # (result key, printed label) pairs in display order
    UTILITY_LABELS = [
        ('utility_ratio', 'Utility Ratio'),
        ('real_model_accuracy', 'Real Model Accuracy'),
        ('synthetic_model_accuracy', 'Synthetic Model Accuracy'),
    ]

    for metric_key, metric_label in UTILITY_LABELS:
        metric_value = utility.get(metric_key)
        # Skip metrics that are missing or explicitly None
        if metric_value is not None:
            print(f"{metric_label}: {metric_value:.4f}")

## 6. Visualize Comparisons

Visualize comparisons between real and synthetic data.

In [None]:
# Compare per-column distributions of real and synthetic data.
# Only numeric columns are considered, and at most the first five are plotted.
numerical_cols = [
    name for name in df.columns if pd.api.types.is_numeric_dtype(df[name])
]
selected_columns = numerical_cols[:5]

# Let the evaluator draw the figure for the chosen columns
fig = evaluator.plot_distributions(
    real_data=df, synthetic_data=synthetic_data,
    columns=selected_columns, figsize=(15, 10),
)

# Add the overall title, then tighten the layout and leave headroom for it
plt.suptitle('Distribution Comparison: Real vs Synthetic', fontsize=16)
plt.tight_layout()
plt.subplots_adjust(top=0.95)
plt.show()

In [None]:
# Correlation structure of real vs synthetic data, drawn by the evaluator
fig = evaluator.plot_correlation_comparison(real_data=df, synthetic_data=synthetic_data, figsize=(15, 7))

# Add the overall title, then tighten the layout and leave headroom for it
plt.suptitle('Correlation Comparison: Real vs Synthetic', fontsize=16)
plt.tight_layout()
plt.subplots_adjust(top=0.90)
plt.show()

## 7. Save Evaluation Results

Save the evaluation results to a JSON file.

In [None]:
# Timestamped JSON name inside the local 'output' directory
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_path = f"output/{timestamp}_evaluation_results.json"
# Create the directory if this is the first file written to it
os.makedirs('output', exist_ok=True)

# Convert numpy values to Python native types for JSON serialization
def convert_to_serializable(obj):
    """Recursively replace NumPy values in ``obj`` with plain Python equivalents.

    Args:
        obj: A NumPy scalar or array, a dict/list/tuple nesting such values,
            or any other object.

    Returns:
        ``bool``/``int``/``float`` for NumPy boolean, integer and floating
        scalars; a nested list for ``np.ndarray``; a new dict (same keys) or
        list with converted members for dicts, lists and tuples; ``obj`` itself
        for anything else.
    """
    # np.bool_ is not an np.integer subclass, so it needs its own branch
    if isinstance(obj, np.bool_):
        return bool(obj)
    # The abstract bases cover every width of integer/float scalar and avoid
    # the np.float_ alias, which no longer exists in NumPy 2.0
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        # tolist() already yields native Python scalars at every nesting level
        return obj.tolist()
    if isinstance(obj, dict):
        return {k: convert_to_serializable(v) for k, v in obj.items()}
    # Tuples are emitted as lists (JSON has no tuple type) so their members
    # are converted as well
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(i) for i in obj]
    return obj

# Strip NumPy types from the results so the json module can encode them
serializable_results = convert_to_serializable(results)

# Write the results as indented JSON
with open(results_path, 'w') as results_file:
    json.dump(serializable_results, results_file, indent=2)

print(f'Evaluation results saved to: {results_path}')

## 8. Conclusion

This notebook demonstrated how to use the Privacy-Preserving Synthetic Data Generator to create synthetic tabular data with privacy guarantees. The system allows for:

1. Loading and preprocessing tabular data
2. Training a CTGAN model with optional Differential Privacy
3. Generating synthetic data with privacy guarantees
4. Evaluating the fidelity, privacy, and utility of the synthetic data
5. Visualizing comparisons between real and synthetic data

The privacy-utility trade-off can be controlled by adjusting the privacy budget (ε) and other parameters.

For a more interactive experience, you can use the Streamlit application by running:
```bash
streamlit run app.py
```