# Subject Table Definition

### Dataset import

In [11]:
import pandas as pd

In [12]:
# Specify the path to the CSV file
csv_file_path = 'C:/Users/david/Desktop/Davide/SyntheticData/Github/SyntheticData/data/test_dataset.csv'
# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

In [13]:
# Create an empty dictionary
datasets = {}

# Add the DataFrame to the dictionary
datasets['test_dataset'] = df


In [14]:
datasets

{'test_dataset':                         NDG  Segment     Region  Age Range Gender     Income  \
 0     726402800814062217560       40  Lombardia         65      F  113838.75   
 1     889583567071317812454       40   Trentino         65      F  186538.50   
 2     530630346698908304055       10  Lombardia         35      M  195732.50   
 3     528364642665697783653       20   Trentino         25      M   46722.00   
 4     527645717531677479155       40    Toscana         65      M   81958.80   
 ...                     ...      ...        ...        ...    ...        ...   
 9995  970347933603133106111       40     Veneto         45      M  149925.00   
 9996  930273368294131315307       20    Toscana         65      M  144211.65   
 9997  447183733060950082862       10   Trentino         25      F  118972.50   
 9998  494600646125903229884       20   Trentino         45      F   23001.00   
 9999  296791268150097416112       20     Veneto         45      M   83766.00   
 
       Dur

### Dataset metadata

In [15]:
from sdv.metadata import MultiTableMetadata

In [16]:
metadata = MultiTableMetadata()

metadata.detect_table_from_dataframe(
    table_name='test_dataset',
    data=df
)

Correct metadata sdtypes

In [17]:
metadata.update_column(
    table_name='test_dataset',
    column_name='NDG',
    sdtype='id',
    regex_format = r'\d{21}'
)

metadata.update_column(
    table_name='test_dataset',
    column_name='Age Range',
    sdtype='categorical'
)

metadata.update_column(
    table_name='test_dataset',
    column_name='Segment',
    sdtype='categorical'
)

metadata.update_column(
    table_name='test_dataset',
    column_name='Income',
    sdtype='numerical',
    computer_representation='Float')

Primary Keys: These keys identify every row of the table. They must be unique to the entire table and other tables may refer to them.

In [18]:
metadata.set_primary_key(
    table_name='test_dataset',
    column_name='NDG'
)

In [19]:
#metadata
metadata.validate

<bound method MultiTableMetadata.validate of {
    "tables": {
        "test_dataset": {
            "columns": {
                "NDG": {
                    "sdtype": "id",
                    "regex_format": "\\d{21}"
                },
                "Segment": {
                    "sdtype": "categorical"
                },
                "Region": {
                    "sdtype": "categorical"
                },
                "Age Range": {
                    "sdtype": "categorical"
                },
                "Gender": {
                    "sdtype": "categorical"
                },
                "Income": {
                    "sdtype": "numerical",
                    "computer_representation": "Float"
                },
                "Duration": {
                    "sdtype": "numerical"
                }
            },
            "primary_key": "NDG"
        }
    },
    "relationships": [],
    "METADATA_SPEC_VERSION": "MULTI_TABLE_V1"
}>

Export metadata json

In [20]:
metadata.save_to_json('data/test_dataset_metadata.json')
metadata = MultiTableMetadata.load_from_json('data/test_dataset_metadata.json')

# Synthetic Data Generation

In [21]:
from sdv.multi_table import HMASynthesizer

In [22]:
synthesizer = HMASynthesizer(metadata)
synthesizer.validate(datasets)

Initialize the SDV model and fit it to the DataFrame:

In [23]:
synthesizer.fit(datasets)

Generate synthetic data using the SDV model:

In [24]:
synthetic_data = synthesizer.sample(scale=1)

In [25]:
synthetic_data

{'test_dataset':                         NDG  Segment     Region  Age Range Gender  \
 0     000000000000000000000       30    Toscana         45      M   
 1     000000000000000000001       20  Lombardia         35      M   
 2     000000000000000000002       20    Sicilia         55      F   
 3     000000000000000000003       40    Sicilia         35      M   
 4     000000000000000000004       30     Veneto         55      M   
 ...                     ...      ...        ...        ...    ...   
 9995  000000000000000009995       20    Toscana         55      M   
 9996  000000000000000009996       30     Veneto         65      F   
 9997  000000000000000009997       30    Sicilia         65      M   
 9998  000000000000000009998       40     Veneto         35      M   
 9999  000000000000000009999       20  Lombardia         65      M   
 
              Income  Duration  
 0      47987.377229        27  
 1      33511.002646        13  
 2      44768.226382        10  
 3      56

In [26]:
datasets

{'test_dataset':                         NDG  Segment     Region  Age Range Gender     Income  \
 0     726402800814062217560       40  Lombardia         65      F  113838.75   
 1     889583567071317812454       40   Trentino         65      F  186538.50   
 2     530630346698908304055       10  Lombardia         35      M  195732.50   
 3     528364642665697783653       20   Trentino         25      M   46722.00   
 4     527645717531677479155       40    Toscana         65      M   81958.80   
 ...                     ...      ...        ...        ...    ...        ...   
 9995  970347933603133106111       40     Veneto         45      M  149925.00   
 9996  930273368294131315307       20    Toscana         65      M  144211.65   
 9997  447183733060950082862       10   Trentino         25      F  118972.50   
 9998  494600646125903229884       20   Trentino         45      F   23001.00   
 9999  296791268150097416112       20     Veneto         45      M   83766.00   
 
       Dur

# Export to csv

In [28]:
synthetic_data['test_dataset'].to_csv('data/test_dataset_synthetic.csv', index=False)