**1. CTGAN**
- strength: categorical handling, complex dependencies
- limitations: training
- suitable for: categorical and mixed data

**2. TVAE**
- strength: Fast training, both continuous and categorical support
- limitations: Weaker dependencies
- suitable for: Structured tabular data

**3. Synthpop**
- strength: Interpretable, low requirements
- limitations: Limited complexity
- suitable for: Small datasets, privacy-preserving

**4. MedGAN**
- strength: Sparse, binary data generation
- limitations: Binary data only
- suitable for: healthcare data

**5. CopulaGAN**
- strength: Captures dependencies via copulas
- limitations: Not udeal for small data
- suitable for: Dependency-sensitive data

In [10]:
import pandas as pd
file_path = '../Data/data.csv'
data = pd.read_csv(file_path)
print(data.shape)
data.head()

(10000, 8)


Unnamed: 0,customer_age_class,region,basket_package_price_disc,internet_package_price_disc,basket_days_until_contract_end,count_previous_cancellations_0_18,internet_product_name,flag_churn
0,70 - 74,NRW,64.95,54.97,366,0,3PLAY PREMIUM HRZ,0
1,35 - 39,HSN,44.98,44.98,100,0,RED INTERNET & PHONE CABLE U,1
2,35 - 39,NRW,54.98,54.98,30,0,RED INTERNET & PHONE CABLE U,1
3,35 - 39,HSN,209.84,209.84,30,0,VF CABLEMAX,1
4,25 - 29,NRW,63.98,63.98,123,0,GIGAZUHAUSE KABEL,1


# CTGAN

In [3]:
import sdv
import pandas as pd
from ctgan import CTGAN

In [4]:
categorical_columns = ['customer_age_class', 'region', 'internet_product_name']

for col in categorical_columns:
    data[col] = data[col].astype('category')
    
ctgan = CTGAN(epochs=30)

In [5]:
ctgan.fit(data, discrete_columns=categorical_columns)

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In [6]:
ctgan_data = ctgan.sample(10000)

In [7]:
ctgan_data.head()

Unnamed: 0,customer_age_class,region,basket_package_price_disc,internet_package_price_disc,basket_days_until_contract_end,count_previous_cancellations_0_18,internet_product_name,flag_churn
0,60 - 64,NRW,25.156373,46.210009,238,0,VF RIP U-TREUE,1
1,55 - 59,HSN,46.666848,45.983056,32,0,RED INTERNET & PHONE CABLE U,0
2,75 - 79,KBW,42.314876,2.452784,32,0,GIGAZUHAUSE KABEL,1
3,30 - 34,NRW,36.100965,34.884192,118,0,VF CABLEMAX,1
4,45 - 49,KBW,15.698746,45.184381,29,0,3PLAY FLY,1


In [8]:
ctgan_data.to_csv('ctgan_data.csv', index=False)

# TVAE

Problems

Couldn't find TVAE in sdv. Because sdv is old verioned and has been changed organizationally and the names as well. Used the dir to finf TVAE, whichis named TVAESynthesizer

In [21]:
print(dir(sdv.single_table))

['CTGANSynthesizer', 'CopulaGANSynthesizer', 'GaussianCopulaSynthesizer', 'TVAESynthesizer', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'base', 'copulagan', 'copulas', 'ctgan', 'utils']


In [9]:
import pandas as pd
from sdv.single_table import TVAESynthesizer
from sdv.metadata import SingleTableMetadata

In [10]:
metadata = SingleTableMetadata()

metadata.add_column('customer_age_class', sdtype='categorical')
metadata.add_column('region', sdtype='categorical')
metadata.add_column('internet_product_name', sdtype='categorical')

for column in data.select_dtypes(include=['int64', 'float64']).columns:
    metadata.add_column(column, sdtype='numerical')

In [11]:
model = TVAESynthesizer(metadata=metadata, epochs=10, batch_size=32)
model.fit(data)

In [12]:
tvae_data = model.sample(num_rows = 10000)

In [13]:
tvae_data

Unnamed: 0,customer_age_class,region,internet_product_name,basket_package_price_disc,internet_package_price_disc,basket_days_until_contract_end,count_previous_cancellations_0_18,flag_churn
0,80 and over,NRW,VF RIP U-TREUE,32.74,37.95,29,0,0
1,55 - 59,NRW,RED INTERNET & PHONE CABLE U,88.03,49.23,31,0,0
2,50 - 54,NRW,VF CABLEMAX,41.67,49.63,31,0,0
3,35 - 39,NRW,VF CABLEMAX,49.96,44.20,28,0,1
4,25 - 29,NRW,RED INTERNET & PHONE CABLE U,14.78,34.11,30,0,1
...,...,...,...,...,...,...,...,...
9995,25 - 29,NRW,VF CABLEMAX,48.33,45.85,29,0,1
9996,50 - 54,NRW,VF CABLEMAX,69.10,49.48,28,0,0
9997,35 - 39,NRW,RED INTERNET & PHONE CABLE U,41.50,35.21,30,0,0
9998,50 - 54,NRW,RED INTERNET & PHONE CABLE U,39.52,44.56,31,0,0


In [14]:
tvae_data.to_csv('tvae_data.csv', index=False)

# Synthpop

In [49]:
file_path = '../Data/data.csv'
data = pd.read_csv(file_path)

# Change the flag_churn from int to object
data['flag_churn'] = data['flag_churn'].astype('object')
data.head()

Unnamed: 0,customer_age_class,region,basket_package_price_disc,internet_package_price_disc,basket_days_until_contract_end,count_previous_cancellations_0_18,internet_product_name,flag_churn
0,70 - 74,NRW,64.95,54.97,366,0,3PLAY PREMIUM HRZ,0
1,35 - 39,HSN,44.98,44.98,100,0,RED INTERNET & PHONE CABLE U,1
2,35 - 39,NRW,54.98,54.98,30,0,RED INTERNET & PHONE CABLE U,1
3,35 - 39,HSN,209.84,209.84,30,0,VF CABLEMAX,1
4,25 - 29,NRW,63.98,63.98,123,0,GIGAZUHAUSE KABEL,1


In [51]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def synthpop(data):
    synthetic_data = pd.DataFrame(columns=data.columns)
    encoders = {}
    
    for col in data.columns:
        predictors = data.drop(columns=[col])
        target=data[col]
        
        # Encode categorical columns in predictors
        for pred_col in predictors.select_dtypes(include=['object']).columns:
            if pred_col not in encoders:
                encoders[pred_col] = LabelEncoder()
                predictors[pred_col] = encoders[pred_col].fit_transform(predictors[pred_col])
            else:
                predictors[pred_col] = encoders[pred_col].transform(predictors[pred_col])
        
        # Encode target column if it is categorical
        if target.dtypes == 'object':
            if col not in encoders:
                encoders[col] = LabelEncoder()
                target = encoders[col].fit_transform(target)
            else:
                target = encoders[col].transform(target)
                
        # Align target index with predictors using boolean indexing
        predictors = predictors.dropna()
        target = pd.Series(target).iloc[predictors.index]
        
        # Choose model based on data type
        if target.dtype in ['int64', 'float64']:
            model = RandomForestRegressor()
        else:
            model = RandomForestClassifier
        
        model.fit(predictors, target)
        
        synthetic_col = model.predict(predictors)
        
        if col in encoders:
            synthetic_col = encoders[col].inverse_transform(synthetic_col.astype(int))
            
        synthetic_data[col] = synthetic_col
        
    return synthetic_data

In [52]:
synthetic_data = synthpop(data)
print(synthetic_data.shape)
synthetic_data.head()

Unnamed: 0,customer_age_class,region,basket_package_price_disc,internet_package_price_disc,basket_days_until_contract_end,count_previous_cancellations_0_18,internet_product_name,flag_churn
0,65 - 69,KBW,64.9377,52.4816,321.58,0.12,3PLAY TREUE GENRE,0
1,35 - 39,HSN,45.2492,44.98,42.196849,0.2,RIP CABLE U TREUE,0
2,50 - 54,KBW,55.03025,54.98,31.8145,0.0,TREUEPLUS,0
3,35 - 39,HSN,209.5564,205.6992,30.09,0.0,VF CABLEMAX,0
4,30 - 34,KBW,63.7601,63.4397,160.06481,0.04,GIGAZUHAUSE KABEL,0


In [54]:
synthetic_data.to_csv('synthpop_data.csv', index=False)

# DataSynthesizer

In [20]:
#https://github.com/DataResponsibly/DataSynthesizer/blob/master/notebooks/DataSynthesizer__random_mode.ipynb
from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

import pandas as pd

In [21]:
# An attribute is categorical if its domain size is less than this threshold.
# Here modify the threshold to adapt to the domain size of "education" (which is 14 in input dataset).
threshold_value = 70

# Number of tuples generated in synthetic dataset.
num_tuples_to_generate = 10000 # Here 32561 is the same as input dataset, but it can be set to another number.

In [22]:
input_data = '../Data/data.csv'
mode = 'random_mode'
description_file = f'./DataSynthesizer/out/{mode}/description.json'
synthetic_data = f'datasythesizer_data.csv'

In [23]:
describer = DataDescriber(category_threshold=threshold_value)
describer.describe_dataset_in_random_mode(input_data)
describer.save_dataset_description_to_file(description_file)

In [24]:
generator = DataGenerator()
generator.generate_dataset_in_random_mode(num_tuples_to_generate, description_file)
generator.save_synthetic_data(synthetic_data)