In [None]:
# Synthetic Data Scaling with SDV (CTGAN / WGAN-GP)
'''
This notebook trains an SDV CTGAN (WGAN-GP architecture) model on a cleaned
transaction-level dataset to generate synthetic data for analytical stress-testing.

Execution Environment:
- Google Colab (Free Tier)
- GPU-enabled runtime (CUDA preferred)

Prerequisites (run before anything else):
1. Install required libraries:
   `!pip install sdv`
2. Mount Google Drive to access input data and save trained models.
   `from google.colab import drive`
   `drive.mount('/content/drive')`

Notes:
- This notebook is not a production ML pipeline.
- The model is used strictly for synthetic data generation, not prediction.
- Personal identifiers were intentionally excluded to preserve stability and focus on aggregate patterns.
'''

In [None]:
### Important
'''
Additional Usage Note:
- File paths and filenames can be changed freely, but they must stay
  consistent across training, generation, and downstream processing
  (for example, the model path written after training must be the same
  path that the generation step loads).
'''

In [None]:
# Install SDV (required for Colab)
!pip install sdv

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import gc

# ------------------------------------------------------------------------------
# 1. HARDWARE CHECK
# ------------------------------------------------------------------------------
# Prefer GPU execution when available (WGAN-GP benefits significantly from CUDA).
# `use_gpu` is the single source of truth: it drives both the message below and
# the synthesizer configuration, so the log always matches what is requested.
use_gpu = torch.cuda.is_available()
device = 'cuda' if use_gpu else 'cpu'
print(f"Hardware detected: training will run on {device.upper()}")

# Free memory before training (the CUDA cache only matters on a GPU runtime)
gc.collect()
if use_gpu:
    torch.cuda.empty_cache()

# ------------------------------------------------------------------------------
# 2. LOAD DATA
# ------------------------------------------------------------------------------
print("Loading cleaned transaction data...")
df_trans = pd.read_csv(
    '/content/drive/MyDrive/DataCo_Synthetic/fixed_columns_final.csv'
)

# ------------------------------------------------------------------------------
# 3. METADATA DEFINITION
# ------------------------------------------------------------------------------
# Automatically infer column types from the dataframe
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df_trans)

# Manual overrides to preserve analytical meaning and stability
# - State fields: prevent SDV from treating them as PII
# - Date components: enforce categorical to avoid fractional values
categorical_overrides = (
    'customer_state',
    'order_state',
    'order_year',
    'order_month',
    'order_day',
)
for column_name in categorical_overrides:
    metadata.update_column(column_name=column_name, sdtype='categorical')

print("Final metadata configuration:")
print(metadata.to_dict())

# ------------------------------------------------------------------------------
# 4. CONFIGURE CTGAN (WGAN-GP)
# ------------------------------------------------------------------------------
print("Initializing CTGAN (WGAN-GP architecture)...")

synthesizer = CTGANSynthesizer(
    metadata,
    epochs=500,       # Higher epochs for improved convergence
    batch_size=500,   # Samples per training step (SDV's documented default)
    verbose=True,
    cuda=use_gpu      # Request the GPU only when the hardware check found one
)

# ------------------------------------------------------------------------------
# 5. TRAIN MODEL
# ------------------------------------------------------------------------------
print("Starting CTGAN training...")
synthesizer.fit(df_trans)

# ------------------------------------------------------------------------------
# 6. SAVE TRAINED MODEL
# ------------------------------------------------------------------------------
# The generation step must load this exact path.
save_path = '/content/drive/MyDrive/DataCo_Synthetic/CTGAN_WGAN_ModelEpochs500.pkl'
synthesizer.save(save_path)

print(f"Model saved to: {save_path}")


In [None]:
## Synthetic Data Quality Evaluation (SDV)
'''
This section evaluates the quality of the generated synthetic dataset against
the original data using SDV's built-in diagnostics and statistical similarity checks.

Purpose:
- Validate that synthetic data preserves structural and statistical properties
- Ensure relationships are usable for analytical stress-testing
- Confirm no major distributional or constraint violations

Notes:
- Evaluation is performed on samples for efficiency
- Scores are used as sanity checks, not as ML benchmarks
- High scores indicate analytical usability, not predictive accuracy
'''

In [None]:
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality, run_diagnostic

# ------------------------------------------------------------------------------
# 1. LOAD REAL AND SYNTHETIC DATA (SAMPLED)
# ------------------------------------------------------------------------------
# NOTE: this cell reads the CSV written by the large-scale generation cell, so
# that cell must have been run (in this or an earlier session) before this one.
print("Loading data for quality evaluation...")

# Real data sample (sufficient for statistical comparison).
# `nrows` takes the first rows of the file, not a random sample.
real_df = pd.read_csv(
    '/content/drive/MyDrive/DataCo_Synthetic/fixed_columns_final.csv',
    nrows=150_000
)

# Synthetic data sample generated by CTGAN.
# The filename matches the output file of the large-scale generation cell.
fake_df = pd.read_csv(
    '/content/drive/MyDrive/DataCo_Synthetic/DataCo_Final_2M.csv',
    nrows=1_500_000
)

# ------------------------------------------------------------------------------
# 2. METADATA SETUP
# ------------------------------------------------------------------------------
# Metadata is inferred from real data to define valid structure and constraints
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_df)

# Re-apply the same manual overrides that were used when the model was trained,
# so the evaluation interprets these columns exactly as training did
# (state fields are not PII; date components are categorical, not numeric).
for column_name in (
    'customer_state',
    'order_state',
    'order_year',
    'order_month',
    'order_day',
):
    metadata.update_column(column_name=column_name, sdtype='categorical')

# ------------------------------------------------------------------------------
# 3. DIAGNOSTIC CHECK
# ------------------------------------------------------------------------------
# Verifies whether the synthetic data violates basic rules or constraints
print("\nRunning diagnostic checks...")
diagnostic = run_diagnostic(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=metadata
)

# ------------------------------------------------------------------------------
# 4. STATISTICAL QUALITY EVALUATION
# ------------------------------------------------------------------------------
# Measures how closely synthetic distributions match the real data
print("\nRunning statistical quality evaluation...")
report = evaluate_quality(
    real_data=real_df,
    synthetic_data=fake_df,
    metadata=metadata
)

# ------------------------------------------------------------------------------
# 5. RESULTS SUMMARY
# ------------------------------------------------------------------------------
print("\n" + "=" * 40)
print(f"Synthetic Data Quality Score: {report.get_score() * 100:.2f}%")
print("=" * 40)

print("\nDetailed Metric Breakdown:")
print(report.get_properties())

In [None]:
## Quick Synthetic Sample Validation
'''
This section performs a lightweight quality check on a small synthetic sample
generated directly from the trained CTGAN model.

Purpose:
- Sanity-check the trained model before large-scale generation
- Verify that basic statistical structure is preserved
- Catch obvious metadata or training issues early

Notes:
- This is a fast validation step, not a full benchmark
- Results are indicative, not final
'''

In [None]:
# ------------------------------------------------------------------------------
# QUICK SYNTHETIC SAMPLE QUALITY CHECK
# ------------------------------------------------------------------------------
# Requires `synthesizer` and `df_trans` from the training cell.
from sdv.evaluation.single_table import evaluate_quality, run_diagnostic

# Number of synthetic rows drawn for this sanity check
VALIDATION_SAMPLE_ROWS = 200_000

# Generate a synthetic sample directly from the trained model
print("Generating synthetic sample for validation...")
validation_sample = synthesizer.sample(num_rows=VALIDATION_SAMPLE_ROWS)
sample_500 = validation_sample  # legacy name, kept for any cell that still refers to it

# Evaluate against the metadata the synthesizer itself holds rather than the
# global `metadata` variable: other cells reassign that variable, so its
# contents depend on which cells were run last.
training_metadata = synthesizer.get_metadata()

# Run statistical quality evaluation
print("Running quality evaluation on synthetic sample...")
quality_report = evaluate_quality(
    real_data=df_trans,
    synthetic_data=validation_sample,
    metadata=training_metadata
)

# Output overall quality score
print("\nFinal synthetic data quality score:")
print(quality_report.get_score())

# Optional diagnostic check (useful for debugging schema or constraint issues)
# diagnostic_report = run_diagnostic(
#     real_data=df_trans,
#     synthetic_data=validation_sample,
#     metadata=training_metadata
# )
# print(diagnostic_report.get_score())

In [None]:
## Large-Scale Synthetic Data Generation with Safety Constraints
'''
This section generates a large synthetic dataset (~2M rows) from the trained
CTGAN model while enforcing real-world relational constraints.

Purpose:
- Scale the dataset for analytical stress-testing
- Prevent unrealistic or hallucinated combinations
- Preserve valid geography, product hierarchy, and customer mappings

Approach:
- Generate data in chunks to manage memory
- Reapply "truth tables" derived from real data
- Enforce valid combinations post-generation
'''

In [None]:
import pandas as pd
from sdv.single_table import CTGANSynthesizer
import gc

# ------------------------------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------------------------------
# MODEL_PATH must be the exact file written by `synthesizer.save(...)` in the
# training cell; a different filename makes the load below fail on a fresh run.
MODEL_PATH = '/content/drive/MyDrive/DataCo_Synthetic/CTGAN_WGAN_ModelEpochs500.pkl'
ORIGINAL_DATA_PATH = '/content/drive/MyDrive/DataCo_Synthetic/fixed_columns_final.csv'
OUTPUT_FILENAME = '/content/drive/MyDrive/DataCo_Synthetic/DataCo_Final_2M.csv'

NUM_ROWS_TO_GENERATE = 2_000_000   # total synthetic rows wanted in the output
CHUNK_SIZE = 100_000               # rows sampled per batch to bound memory use

# ------------------------------------------------------------------------------
# 1. LOAD MODEL AND REFERENCE DATA
# ------------------------------------------------------------------------------
print("Loading trained CTGAN model and reference dataset...")

# NOTE(review): the model file is a .pkl; only load model files you created
# yourself, since unpickling an untrusted file can execute arbitrary code.
synthesizer = CTGANSynthesizer.load(MODEL_PATH)
df_real = pd.read_csv(ORIGINAL_DATA_PATH)

# ------------------------------------------------------------------------------
# 2. BUILD REFERENCE CONSTRAINT TABLES ("TRUTH TABLES")
# ------------------------------------------------------------------------------
# These tables are used to re-enforce valid combinations after sampling.
# Each table holds exactly ONE row per join key. If a key appeared with several
# combinations and all of them were kept, the merge below would emit one output
# row per combination and silently multiply the synthetic rows.

print("Building reference constraint tables...")


def _build_lookup(reference_df, key, dependent_columns):
    """Map each `key` value to its most frequent combination of dependents.

    Parameters:
        reference_df: real data the valid combinations are derived from.
        key: column the synthetic batch will be joined on.
        dependent_columns: columns whose values are determined by `key`.

    Returns:
        DataFrame with columns `[key, *dependent_columns]` and a unique `key`.
        When a key occurs with several combinations in the real data, the most
        frequent one is kept (ties are resolved arbitrarily).
    """
    columns = [key, *dependent_columns]
    # value_counts sorts by descending frequency, so the first row seen for a
    # key is its most common combination. dropna=False keeps missing values.
    ranked = reference_df[columns].value_counts(dropna=False).reset_index(name="_n_rows")

    ambiguous_keys = ranked.loc[ranked[key].duplicated(), key].nunique(dropna=False)
    if ambiguous_keys:
        print(
            f"  Note: {ambiguous_keys} '{key}' value(s) map to more than one "
            "combination; keeping the most frequent one for each."
        )

    return (
        ranked
        .drop_duplicates(subset=key, keep="first")
        .drop(columns="_n_rows")
        .reset_index(drop=True)
    )


# Order geography constraints
valid_order_geo = _build_lookup(
    df_real, "order_state", ["order_country", "order_region", "market"]
)

# Product hierarchy constraints
valid_products = _build_lookup(
    df_real, "product_name", ["category_name", "department_name"]
)

# Customer geography constraints
valid_customer_geo = _build_lookup(
    df_real, "customer_state", ["customer_country"]
)

print(f"Valid order geography combinations: {len(valid_order_geo)}")
print(f"Valid customer geography combinations: {len(valid_customer_geo)}")
print(f"Valid product combinations: {len(valid_products)}")

# (join key, lookup table) pairs, applied to every batch in this order
constraint_lookups = [
    ("order_state", valid_order_geo),
    ("product_name", valid_products),
    ("customer_state", valid_customer_geo),
]

# ------------------------------------------------------------------------------
# 3. SYNTHETIC DATA GENERATION (CHUNKED)
# ------------------------------------------------------------------------------
# Explicit per-chunk sizes: the last chunk carries any remainder, so the request
# is honoured even when NUM_ROWS_TO_GENERATE is not a multiple of CHUNK_SIZE.
chunk_sizes = [
    min(CHUNK_SIZE, NUM_ROWS_TO_GENERATE - start)
    for start in range(0, NUM_ROWS_TO_GENERATE, CHUNK_SIZE)
]
total_chunks = len(chunk_sizes)
synthetic_batches = []

print(f"Starting synthetic generation of {NUM_ROWS_TO_GENERATE} rows...")

for i, rows_in_chunk in enumerate(chunk_sizes):
    print(f"Generating batch {i + 1}/{total_chunks}")

    batch = synthesizer.sample(num_rows=rows_in_chunk)

    # --------------------------------------------------------------------------
    # APPLY SAFETY CONSTRAINTS
    # --------------------------------------------------------------------------
    # For each constraint: discard the sampled dependent columns and re-derive
    # them from the real-data lookup. The inner join removes rows whose key has
    # no valid mapping; validate="many_to_one" guarantees the join can never
    # duplicate a synthetic row.
    for key, lookup in constraint_lookups:
        dependent_columns = [col for col in lookup.columns if col != key]
        batch = batch.drop(columns=dependent_columns).merge(
            lookup, on=key, how="inner", validate="many_to_one"
        )

    rows_dropped = rows_in_chunk - len(batch)
    if rows_dropped:
        print(f"  {rows_dropped} rows removed (no valid mapping for a key)")

    synthetic_batches.append(batch)

    # Explicit memory cleanup between batches
    del batch
    gc.collect()

# ------------------------------------------------------------------------------
# 4. FINAL ASSEMBLY AND EXPORT
# ------------------------------------------------------------------------------
print("Combining generated batches...")

df_final = pd.concat(synthetic_batches, ignore_index=True)

# Downsample (and shuffle reproducibly) to the requested row count. The sample
# size is capped at the number of rows that survived the constraints, because
# asking for more rows than exist would raise and lose the whole run.
rows_available = len(df_final)
if rows_available < NUM_ROWS_TO_GENERATE:
    print(
        f"Warning: only {rows_available} of {NUM_ROWS_TO_GENERATE} requested "
        "rows passed the safety constraints."
    )
df_final = df_final.sample(
    n=min(NUM_ROWS_TO_GENERATE, rows_available),
    random_state=42
)

print("Synthetic data generation complete.")
print(f"Final row count: {len(df_final)}")

print(f"Saving output to: {OUTPUT_FILENAME}")
df_final.to_csv(OUTPUT_FILENAME, index=False)

print("Generation process finished.")
