In [None]:
# Synthetic Data Generator & Validator

from sdv.tabular import CTGAN
import pandas as pd

# 1. Read the real (source) table from CSV.
#    Point `source_csv` at your own file before running this cell.
source_csv = "data/synthetic/generated-datasets/prod.csv"
real_data = pd.read_csv(source_csv)
# Plain-text preview of the first five rows.
print(real_data.head(5))

In [None]:
# 2. Train CTGAN
# Build a CTGAN model with its default constructor arguments and fit it on
# the real table loaded in step 1. The fitted `model` is what step 3 samples
# from.
# NOTE(review): `CTGAN` is imported from `sdv.tabular`, which looks like the
# pre-1.0 SDV API; newer SDV releases expose CTGAN through
# `sdv.single_table` instead — confirm the SDV version this notebook targets.
# NOTE(review): no random seed is set in this notebook, so training (and the
# samples drawn later) are presumably not reproducible between runs — verify.
model = CTGAN()
model.fit(real_data)

In [None]:
# 3. Draw a synthetic table with as many rows as the real one,
#    then show a plain-text preview of its first rows.
synthetic_data = model.sample(real_data.shape[0])
print(synthetic_data.head())

In [None]:
# 4. Statistical validation
from scipy.stats import ks_2samp

# Two-sample Kolmogorov-Smirnov test for every numeric column of the real
# table, comparing its distribution against the same column in the synthetic
# table. A small p-value means the two samples are unlikely to come from the
# same distribution.
for col in real_data.select_dtypes(include='number'):
    # Missing values are not observations: leaving them in makes ks_2samp
    # return NaN (or a misleading result on older SciPy), so drop them from
    # both samples before testing.
    real_values = real_data[col].dropna()
    synthetic_values = synthetic_data[col].dropna()

    # ks_2samp raises on an empty sample; report the column and move on
    # instead of aborting the whole validation loop.
    if real_values.empty or synthetic_values.empty:
        print(f'Column: {col}, KS-test skipped (no non-null values to compare)')
        continue

    pval = ks_2samp(real_values, synthetic_values).pvalue
    print(f'Column: {col}, KS-test p-value: {pval}')

In [None]:
# 5. Optionally, NER for PII (use spaCy)
import spacy
nlp = spacy.load("en_core_web_sm")

# Pick one random row and build the text from its actual cell values.
# Stringifying the DataFrame itself would feed spaCy the pandas display repr,
# in which long cells and wide frames are elided with "..." and the column
# headers / index are mixed in, so PII in truncated cells would be missed.
sample_row = real_data.sample(1).iloc[0]
doc = nlp("\n".join(str(value) for value in sample_row.dropna()))

# Entity labels treated as potential PII. 'EMAIL' is kept for pipelines that
# define such a label, but the stock en_core_web_sm NER does not emit it.
pii_labels = ('PERSON', 'ORG', 'GPE', 'EMAIL', 'CARDINAL')
pii = [ent for ent in doc.ents if ent.label_ in pii_labels]

# E-mail addresses are therefore detected through the tokenizer's lexical
# `like_email` flag; each hit is wrapped in a one-token span so `pii` stays a
# list of spans. Tokens already tagged 'EMAIL' by the NER are not repeated.
pii += [
    doc[token.i:token.i + 1]
    for token in doc
    if token.like_email and token.ent_type_ != 'EMAIL'
]
print("PII Entities:", pii)