In [17]:
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality, get_column_plot, get_column_pair_plot
from sdv.utils import load_synthesizer
from sdv.metadata import SingleTableMetadata
import os

In [2]:
adult = fetch_ucirepo(id=2) # Adult dataset

In [3]:
adult_df = pd.concat([adult.data.features, adult.data.targets], axis=1)
adult_df['income'] = adult_df['income'].str.strip().str.replace('.', '', regex=False)
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=adult_df)
print("Dataset loaded successfully.")

Dataset loaded successfully.


In [4]:
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
print(adult_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
 13  native-country  48568 non-null  object
 14  income          48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB
None


In [6]:
print(adult_df.shape)

(48842, 15)


In [7]:

for column in adult_df.select_dtypes(include=['object']).columns:
    print(f"\nValue counts for column '{column}':")
    print(adult_df[column].value_counts())


Value counts for column 'workclass':
workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
?                    1836
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

Value counts for column 'education':
education
HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: count, dtype: int64

Value counts for column 'marital-status':
marital-status
Married-civ-spouse       22379
Never-married            16117
Divorced                  6633
Separated                 1530
Widowed                   1518
Married-spouse-absent      628
Married-AF-spouse     

In [8]:
synthesizer = CTGANSynthesizer(metadata, cuda=False, verbose=True)
synthesizer.fit(adult_df)

Gen. (-0.45) | Discrim. (0.12): 100%|██████████| 300/300 [31:57<00:00,  6.39s/it] 


In [9]:
synthetic_data = synthesizer.sample(num_rows=len(adult_df))


print("Synthetic data generated successfully!")
print(synthetic_data.head())

Synthetic data generated successfully!
   age    workclass  fnlwgt     education  education-num      marital-status  \
0   20            ?  219061     Bachelors             13       Never-married   
1   28      Private  220221     Doctorate             15       Never-married   
2   28      Private  162109       HS-grad              9       Never-married   
3   39  Federal-gov  213377     Bachelors             10  Married-civ-spouse   
4   48      Private  206552  Some-college             10            Divorced   

        occupation   relationship                race     sex  capital-gain  \
0     Craft-repair      Own-child               White    Male             0   
1   Prof-specialty  Not-in-family               White    Male            14   
2   Prof-specialty      Own-child               White    Male            12   
3  Protective-serv        Husband  Amer-Indian-Eskimo    Male             2   
4     Adm-clerical      Unmarried               White  Female          4327   

   ca

In [11]:
diagnostic_report = run_diagnostic(
    real_data=adult_df,
    synthetic_data=synthetic_data,
    metadata=metadata
)

print("--- Diagnostic Report ---")
print(diagnostic_report)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 150.30it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 406.31it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

--- Diagnostic Report ---
<sdmetrics.reports.single_table.diagnostic_report.DiagnosticReport object at 0x7fa6e56bb640>


In [12]:
quality_report = evaluate_quality(
    real_data=adult_df,
    synthetic_data=synthetic_data,
    metadata=metadata
)

print("\n--- Full Quality Report Summary ---")
print(quality_report)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 55.39it/s]|
Column Shapes Score: 87.29%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:01<00:00, 57.90it/s]|
Column Pair Trends Score: 81.93%

Overall Score (Average): 84.61%


--- Full Quality Report Summary ---
<sdmetrics.reports.single_table.quality_report.QualityReport object at 0x7fa6e56ba710>


In [16]:
output_dir = './images/CTGAN/1/'

# Create the directory if it doesn't already exist
os.makedirs(output_dir, exist_ok=True)
print(f"Saving images to '{output_dir}' directory...")

# Get the list of all column names
column_names = adult_df.columns

# Loop through each column name
for column in column_names:
    print(f"Generating and saving plot for column: '{column}'...")
    
    # Generate the plot for the current column
    fig = get_column_plot(
        real_data=adult_df,
        synthetic_data=synthetic_data,
        column_name=column,
        metadata=metadata
    )
    
    # Define the full file path for the image
    file_path = os.path.join(output_dir, f'{column}.png')
    
    # Save the figure to a file
    fig.write_image(file_path)

print("\nAll plots have been saved successfully.")

Saving images to './images/CTGAN/1/' directory...
Generating and saving plot for column: 'age'...


ValueError: 
Image export using the "kaleido" engine requires the Kaleido package,
which can be installed using pip:

    $ pip install --upgrade kaleido
