# Subject Table Definition

### Dataset import

In [29]:
import pandas as pd

In [30]:
# Specify the path to the CSV file
csv_file_path = 'C:/Users/david/Desktop/Davide/SyntheticData/Github/SyntheticData/data/test_dataset.csv'
# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

In [31]:
# Create an empty dictionary
datasets = {}

# Add the DataFrame to the dictionary
datasets['test_dataset'] = df


In [32]:
datasets

{'test_dataset':                         NDG  Segment     Region  Age Range Gender     Income  \
 0      29077800288462831460       20  Lombardia         65      M  344940.00   
 1     629940137899049454465       30    Sicilia         65      F   94364.00   
 2     392586367169128742483       10     Veneto         65      M  116270.00   
 3     908525913488204920962       30  Lombardia         65      F  134755.00   
 4     432979353440523532437       40    Toscana         25      M   29621.25   
 ...                     ...      ...        ...        ...    ...        ...   
 9995  539391625009438288024       40     Veneto         55      M  161975.00   
 9996  313270091782835594768       10   Trentino         65      M   33204.00   
 9997  627847515718849033297       40  Lombardia         65      F  446515.00   
 9998  408427169752387416101       40     Veneto         55      F   19410.00   
 9999  883835472490109028594       20     Veneto         55      F   40037.50   
 
       Dur

### Dataset metadata

In [33]:
from sdv.metadata import MultiTableMetadata

In [34]:
metadata = MultiTableMetadata()

metadata.detect_table_from_dataframe(
    table_name='test_dataset',
    data=df
)

Correct metadata sdtypes

In [35]:
metadata.update_column(
    table_name='test_dataset',
    column_name='NDG',
    sdtype='id',
    regex_format = r'\d{21}'
)

metadata.update_column(
    table_name='test_dataset',
    column_name='Age Range',
    sdtype='categorical'
)

metadata.update_column(
    table_name='test_dataset',
    column_name='Segment',
    sdtype='categorical'
)

metadata.update_column(
    table_name='test_dataset',
    column_name='Income',
    sdtype='numerical',
    computer_representation='Float')

Primary Keys: These keys identify every row of the table. They must be unique to the entire table and other tables may refer to them.

In [36]:
metadata.set_primary_key(
    table_name='test_dataset',
    column_name='NDG'
)

In [37]:
#metadata
metadata.validate

<bound method MultiTableMetadata.validate of {
    "tables": {
        "test_dataset": {
            "columns": {
                "NDG": {
                    "sdtype": "id",
                    "regex_format": "\\d{21}"
                },
                "Segment": {
                    "sdtype": "categorical"
                },
                "Region": {
                    "sdtype": "categorical"
                },
                "Age Range": {
                    "sdtype": "categorical"
                },
                "Gender": {
                    "sdtype": "categorical"
                },
                "Income": {
                    "sdtype": "numerical",
                    "computer_representation": "Float"
                },
                "Duration": {
                    "sdtype": "numerical"
                }
            },
            "primary_key": "NDG"
        }
    },
    "relationships": [],
    "METADATA_SPEC_VERSION": "MULTI_TABLE_V1"
}>

Export metadata json

In [38]:
metadata.save_to_json('data/test_dataset_metadata.json')
metadata = MultiTableMetadata.load_from_json('data/test_dataset_metadata.json')

# Synthetic Data Generation

In [39]:
from sdv.multi_table import HMASynthesizer

In [40]:
synthesizer = HMASynthesizer(metadata)
synthesizer.validate(datasets)

Initialize the SDV model and fit it to the DataFrame:

In [41]:
synthesizer.fit(datasets)

Generate synthetic data using the SDV model:

In [42]:
synthetic_data = synthesizer.sample(scale=1)

In [43]:
synthetic_data

{'test_dataset':                         NDG  Segment     Region  Age Range Gender  \
 0     000000000000000000000       40     Veneto         35      F   
 1     000000000000000000001       10    Sicilia         25      F   
 2     000000000000000000002       10    Toscana         45      M   
 3     000000000000000000003       30    Sicilia         65      F   
 4     000000000000000000004       40   Trentino         45      F   
 ...                     ...      ...        ...        ...    ...   
 9995  000000000000000009995       40   Trentino         65      F   
 9996  000000000000000009996       20   Trentino         55      F   
 9997  000000000000000009997       30    Toscana         25      F   
 9998  000000000000000009998       10  Lombardia         25      F   
 9999  000000000000000009999       10  Lombardia         25      F   
 
              Income  Duration  
 0      76297.890936        19  
 1      18023.281533        32  
 2      46577.379555         9  
 3      28

In [44]:
datasets

{'test_dataset':                         NDG  Segment     Region  Age Range Gender     Income  \
 0      29077800288462831460       20  Lombardia         65      M  344940.00   
 1     629940137899049454465       30    Sicilia         65      F   94364.00   
 2     392586367169128742483       10     Veneto         65      M  116270.00   
 3     908525913488204920962       30  Lombardia         65      F  134755.00   
 4     432979353440523532437       40    Toscana         25      M   29621.25   
 ...                     ...      ...        ...        ...    ...        ...   
 9995  539391625009438288024       40     Veneto         55      M  161975.00   
 9996  313270091782835594768       10   Trentino         65      M   33204.00   
 9997  627847515718849033297       40  Lombardia         65      F  446515.00   
 9998  408427169752387416101       40     Veneto         55      F   19410.00   
 9999  883835472490109028594       20     Veneto         55      F   40037.50   
 
       Dur

# Export to csv

In [45]:
synthetic_data['test_dataset'].to_csv('data/test_dataset_synthetic.csv', index=False)