# MultiTable

## Set up source relational data

In [None]:
# Render the entity-relationship diagram of the demo ecommerce database.

from IPython.display import Image

schema_image_url = "https://gretel-blueprints-pub.s3.us-west-2.amazonaws.com/rdb/ecommerce_db.png"

# The URL is passed positionally (not as url=...) on purpose, exactly as before;
# the Image object is the cell's last expression so the notebook displays it.
Image(schema_image_url, width=600, height=600)

In [None]:
# Download the demo database

!curl -o "ecom_xf.db" "https://gretel-blueprints-pub.s3.us-west-2.amazonaws.com/rdb/ecom_xf.db"

In [None]:
# Point the SQLite connector at the downloaded database file and pull its
# contents into `relational_data`, which the MultiTable cell below consumes.

from gretel_trainer.relational.connectors import SQLite

# Connection string for the local ecom_xf.db file.
ecommerce_db_path = "sqlite:///ecom_xf.db"

sqlite = SQLite(
    db_path=ecommerce_db_path,
    out_dir="./out",
)

relational_data = sqlite.extract()

In [None]:
# Alternative to the SQLite connector: build the relational structure by hand
# from one CSV file per table.
#
# The result is stored in `rel_data`; the cells below read `relational_data`,
# so pass `rel_data` to MultiTable instead if you want to use this version.

from gretel_trainer.relational.core import RelationalData
import pandas as pd

# Placeholder: replace with the directory that holds <table_name>.csv files.
csv_dir = "/path/to/extracted_csvs"

rel_data = RelationalData()

# Register every table; each one uses "id" as its primary key and is loaded
# from "<csv_dir>/<table_name>.csv".
for table_name in (
    "events",
    "users",
    "inventory_items",
    "products",
    "distribution_center",
    "order_items",
):
    rel_data.add_table(table_name, "id", pd.read_csv(f"{csv_dir}/{table_name}.csv"))

# Declare the relationships as (foreign key column, referenced column) pairs.
for foreign_key, referenced_key in (
    ("events.user_id", "users.id"),
    ("order_items.user_id", "users.id"),
    ("order_items.inventory_item_id", "inventory_items.id"),
    ("inventory_items.product_id", "products.id"),
    ("inventory_items.product_distribution_center_id", "distribution_center.id"),
    ("products.distribution_center_id", "distribution_center.id"),
):
    rel_data.add_foreign_key(foreign_key, referenced_key)


## Transform, train, and generate from the source data

In [None]:
# Wrap the extracted `relational_data` in a MultiTable instance. The cells
# below call transform(), train(), retrain_with_table() and generate() on it.
from gretel_trainer.relational.multi_table import MultiTable

multitable = MultiTable(relational_data=relational_data)

In [None]:
# Apply transform policies to a subset of the tables.

# Table name -> URL of the transform policy used for that table.
transform_configs = {
    "users": "https://gretel-blueprints-pub.s3.amazonaws.com/rdb/users_policy.yaml",
    "events": "https://gretel-blueprints-pub.s3.amazonaws.com/rdb/events_policy.yaml",
}

# NOTE(review): in_place=False presumably leaves multitable's own table data
# untouched (the next cell compares it against xform_out) — confirm in the
# gretel_trainer docs.
xform_out = multitable.transform(configs=transform_configs, in_place=False)

In [None]:
# Compare original to transformed: first five rows of the "users" table as
# held by multitable, followed by the same table from the transform output.

from IPython.display import display

# display() uses each object's rich notebook representation, whereas print()
# falls back to plain text that wraps and truncates wide tables.
display(multitable.relational_data.get_table_data("users").head(5))
display(xform_out["users"].head(5))

In [None]:
# Train synthetic models for all tables
# train() is called without arguments and its return value (if any) is not
# used; tables are retrained individually in a later cell.

multitable.train()

The `products` and `inventory_items` tables from the ecommerce dataset each have a column (`sku`) that holds only null / None / NaN values, which consistently causes ACTGAN training to fail. The next cell demonstrates how to provide a modified version of a table and retrain _just that table_, without discarding and retraining the other tables that trained successfully. (To keep the demo simple, we just drop the `sku` column.)

In [None]:
# For each affected table, in the same order as before: fetch its current
# data, drop the `sku` column, and retrain only that table with the result.
for table_name in ("products", "inventory_items"):
    source_table = multitable.relational_data.get_table_data(table_name)
    table_without_sku = source_table.drop(columns=["sku"])
    multitable.retrain_with_table(table_name, table_without_sku)

In [None]:
# Run generation on the MultiTable instance and keep the result for inspection.
# NOTE(review): presumably a mapping of table name -> synthetic DataFrame —
# confirm against the gretel_trainer docs.
synthetic_tables = multitable.generate()