In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os

# Loading

In [2]:
from sdv.datasets.demo import download_demo
from sdv.multi_table import HMASynthesizer
from sdv.evaluation.multi_table import run_diagnostic, evaluate_quality, get_column_plot
from IPython.display import display

In [3]:
# Load demo dataset
real_data, metadata = download_demo(
    modality='multi_table',
    dataset_name='fake_hotels'
)

In [4]:
print("Real Hotels Data:")
real_data['hotels'].head(3)

Real Hotels Data:


Unnamed: 0,hotel_id,city,state,rating,classification
0,HID_000,Boston,Massachusetts,4.8,RESORT
1,HID_001,Boston,Massachuesetts,4.1,CHAIN
2,HID_002,San Francisco,California,3.8,MOTEL


In [5]:
print("Real Guests Data:")
real_data['guests'].head(3)

Real Guests Data:


Unnamed: 0,guest_email,hotel_id,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,awolf@phillips.com,HID_000,False,BASIC,37.89,27 Dec 2020,28 Dec 2020,156.23,"993 Rebecca Landing\nJesseburgh, PA 05072",4075084747483975747
1,tonya44@wilkinson-wilkins.com,HID_000,False,BASIC,24.37,30 Dec 2020,31 Dec 2020,139.43,"958 Beverly Bypass\nSouth Ronald, GA 46368",180072822063468
2,harriskathleen@goodwin.com,HID_000,True,DELUXE,0.0,17 Sep 2020,19 Sep 2020,403.33,"8302 Nathaniel Pike\nRileyland, TX 71613",38983476971380


# Training

In [6]:
synthesizer = HMASynthesizer(metadata)
synthesizer.fit(real_data)

Preprocess Tables: 100%|█████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 18.10it/s]



Learning relationships:


(1/1) Tables 'hotels' and 'guests' ('hotel_id'): 100%|█████████████████████████████████| 10/10 [00:02<00:00,  3.98it/s]





Modeling Tables: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.56it/s]


In [7]:
os.makedirs("models", exist_ok=True)  # Ensure the folder exists
with open("models/hma_synthesizer.pkl", "wb") as f:
    pickle.dump(synthesizer, f)

# Generate Synthetic Data

In [8]:
synthetic_data = synthesizer.sample(scale=2)

In [9]:
print("Synthetic Hotels Data:")
synthetic_data['hotels'].head(3)

Synthetic Hotels Data:


Unnamed: 0,hotel_id,city,state,rating,classification
0,HID_001,New York City,Massachusetts,4.4,CHAIN
1,HID_000,Los Angeles,New York,4.1,MOTEL
2,HID_009,San Francisco,Massachusetts,4.6,CHAIN


In [10]:
print("Synthetic Guests Data:")
synthetic_data['guests'].head(3)

Synthetic Guests Data:


Unnamed: 0,guest_email,hotel_id,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,moodyeric@example.net,HID_001,False,BASIC,19.2,04 Feb 2020,07 Feb 2020,107.89,"PSC 4273, Box 0888\nAPO AA 28423",5161033759518983
1,coryguzman@example.com,HID_001,False,BASIC,19.31,19 Jul 2020,15 May 2020,74.2,"417 Lawrence Hollow\nEast Ericshire, IN 65660",4133047413145475690
2,caitlinlee@example.net,HID_001,False,BASIC,29.7,18 Mar 2020,15 Apr 2020,87.09,"69754 Mcguire Haven Apt. 260\nCrawfordside, IN...",4977328103788


In [11]:
# Diagnostics
diagnostic = run_diagnostic(real_data, synthetic_data, metadata)
print("Diagnostic Report:", diagnostic)

Generating report ...

(1/3) Evaluating Data Validity: |███████████████████████████████████████████████████| 15/15 [00:00<00:00, 1671.04it/s]|
Data Validity Score: 100.0%

(2/3) Evaluating Data Structure: |█████████████████████████████████████████████████████| 2/2 [00:00<00:00, 500.13it/s]|
Data Structure Score: 100.0%

(3/3) Evaluating Relationship Validity: |██████████████████████████████████████████████| 1/1 [00:00<00:00, 225.35it/s]|
Relationship Validity Score: 100.0%

Overall Score (Average): 100.0%

Diagnostic Report: <sdmetrics.reports.multi_table.diagnostic_report.DiagnosticReport object at 0x000001F3A4BCA9B0>


In [12]:
# Quality evaluation
quality_report = evaluate_quality(real_data, synthetic_data, metadata)
print("Quality Report:", quality_report.get_details('Column Shapes'))

Generating report ...

(1/4) Evaluating Column Shapes: |███████████████████████████████████████████████████| 15/15 [00:00<00:00, 1041.42it/s]|
Column Shapes Score: 85.05%

(2/4) Evaluating Column Pair Trends: |███████████████████████████████████████████████| 55/55 [00:00<00:00, 629.92it/s]|
Column Pair Trends Score: 71.46%

(3/4) Evaluating Cardinality: |████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 250.41it/s]|
Cardinality Score: 70.0%

(4/4) Evaluating Intertable Trends: |████████████████████████████████████████████████| 50/50 [00:00<00:00, 465.66it/s]|
Intertable Trends Score: 74.99%

Overall Score (Average): 75.37%

Quality Report:     Table          Column        Metric     Score
0  guests     has_rewards  TVComplement  0.949088
1  guests       room_type  TVComplement  0.968085
2  guests   amenities_fee  KSComplement  0.833403
3  guests    checkin_date  KSComplement  0.949848
4  guests   checkout_date  KSComplement  0.934169
5  guests       room_rate 

In [13]:
# Column-level comparison plot
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    column_name='has_rewards',
    table_name='guests',
    metadata=metadata
)
fig.show()