In [None]:
import numpy as np
import pandas as pd

def generate_synthetic_pca_dataset(
    cleaned_file_path: str,
    synthetic_file_path: str,
    n_samples: "int | None" = None,
    random_state: int = 42,
) -> None:
    """Write a synthetic CSV that mimics the per-column marginals of a cleaned CSV.

    The input CSV is expected to contain the columns ``Time``, ``Amount`` and
    ``Class`` plus any number of columns whose name starts with ``"V"``.
    Each output column is generated independently of the others:

    * every ``V*`` column is drawn from a normal distribution with the
      column's observed mean and (sample) standard deviation, then clipped
      to the column's observed ``[min, max]`` range;
    * ``Time`` and ``Amount`` are resampled with replacement from the
      observed values;
    * ``Class`` is a 0/1 draw whose success probability is the observed
      mean of ``Class``.

    Args:
        cleaned_file_path: Path of the CSV to read.
        synthetic_file_path: Path of the CSV to write (without the index).
        n_samples: Number of synthetic rows. ``None`` means "as many rows as
            the input file".
        random_state: Seed for the random draws; the same seed and input
            produce the same output file.

    Raises:
        ValueError: If ``n_samples`` is negative, or if rows are requested
            but the input file contains no rows to learn from.
    """
    source = pd.read_csv(cleaned_file_path)

    if n_samples is None:
        n_samples = len(source)
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")
    if n_samples > 0 and source.empty:
        raise ValueError(
            f"cannot generate {n_samples} rows: {cleaned_file_path} has no data rows"
        )

    # A private RandomState yields the same legacy stream that
    # np.random.seed(random_state) would, but leaves the process-wide
    # NumPy generator untouched for the caller.
    rng = np.random.RandomState(random_state)

    pca_cols = [c for c in source.columns if c.startswith("V")]

    # Collect the columns in a dict and build the frame once at the end.
    # The order of the random draws (PCA columns, Time, Amount, Class) is
    # what makes the output reproducible for a given seed, so keep it fixed.
    generated = {}
    for col in pca_cols:
        observed = source[col]
        sigma = observed.std()
        if pd.isna(sigma):
            # A single observation has no sample std; treat it as zero spread
            # rather than letting NaN poison every generated value.
            sigma = 0.0
        draws = rng.normal(loc=observed.mean(), scale=sigma, size=n_samples)
        generated[col] = np.clip(draws, observed.min(), observed.max())

    generated["Time"] = rng.choice(source["Time"].values, size=n_samples, replace=True)
    generated["Amount"] = rng.choice(source["Amount"].values, size=n_samples, replace=True)

    fraud_rate = source["Class"].mean()
    generated["Class"] = (rng.rand(n_samples) < fraud_rate).astype(int)

    ordered = ["Time"] + pca_cols + ["Amount", "Class"]
    pd.DataFrame(generated, columns=ordered).to_csv(synthetic_file_path, index=False)
    print(f"✅ Synthetic dataset with {n_samples:,} rows written to {synthetic_file_path}")
# Driver: announce start, then generate the synthetic dataset next to the
# cleaned workshop data using the default seed and the input's row count.
print('hello....')
generate_synthetic_pca_dataset(
    cleaned_file_path="/mnt/data/Fraud-Detection-Workshop/cleaned_cc_transaction_rolling_2d_window.csv",
    synthetic_file_path="/mnt/data/Fraud-Detection-Workshop/synthetic_pca_dataset.csv",
    # n_samples=100000,  # optionally override
)
