# Gretel Trainer

This notebook is designed to help users successfully train synthetic models on complex datasets with high row and column counts. It works by intelligently dividing the dataset into a set of smaller datasets of correlated columns, which can be trained in parallel and then stitched back together.

In [None]:
import strategy
import runner

from gretel_client.projects import create_or_get_unique_project
from gretel_client.projects.models import read_model_config
from gretel_client.projects.jobs import Status
from gretel_synthetics.utils.header_clusters import cluster

import pandas as pd

In [None]:
# --- Tunables for the partitioning / training run ---
CACHE_FILE = "runner.json"  # Cache file handed to the StrategyRunner
CLUSTER_SIZE = 20  # Max columns per cluster
MAX_ROWS = 50_000  # Maximum row count per model

# Create (or fetch) the Gretel project and show where training can be watched.
PROJECT = create_or_get_unique_project(name="trainer-dataset")
print(f"Follow model training at: {PROJECT.get_console_url()}")

In [None]:
# Read the CSV that will be synthesized into a DataFrame.
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
DF = pd.read_csv(filepath_or_buffer="cpu_states.csv", low_memory=False)
DF  # notebook display of the loaded frame

In [None]:
# Use header clustering to create a strategy for parallel training.
# This may take a few minutes.
#
# CLUSTER_SIZE is passed explicitly as the per-cluster column cap so that
# changing the constant actually changes the clustering.
HEADER_CLUSTERS = cluster(DF, maxsize=CLUSTER_SIZE)
print(f"Initial dataset size: {DF.shape[0]} rows {DF.shape[1]} columns")
# Clusters may differ in size and there may be only one of them, so report the
# largest cluster instead of indexing a fixed position (which raised IndexError
# for a single-cluster result).
print(f"  --> Created {len(HEADER_CLUSTERS)} clusters for training with up to {max(map(len, HEADER_CLUSTERS), default=0)} columns each.")

In [None]:
# Start from a default configuration template (loaded from GitHub), then tailor it.
CONFIG = read_model_config("synthetics/mostly-numeric-data")

# Fix the number of training epochs.
CONFIG["models"][0]["synthetics"]["params"]["epochs"] = 200

# Replace the privacy filter section with one where both filters are set to None.
# NOTE(review): presumably None disables each filter — confirm against the Gretel config docs.
CONFIG["models"][0]["synthetics"]["privacy_filters"] = {
    "outliers": None,
    "similarity": None,
}

In [None]:
# Describe the limits the partition strategy must respect; the concrete
# partitions are derived from these constraints.
#
# Accepted parameters:
# - header_clusters: the header clusters to use; when omitted, all headers are used
# - max_row_count: upper bound on the number of records in one row cluster
# - max_row_partitions: upper bound on the number of row "clusters";
#   cannot be combined with `max_row_count`

constraints = strategy.PartitionConstraints(
    max_row_count=MAX_ROWS,
    header_clusters=HEADER_CLUSTERS,
)

In [None]:
# Build the job runner that will drive training for this strategy.
run = runner.StrategyRunner(
    strategy_id="foo",
    project=PROJECT,
    df=DF,
    model_config=CONFIG,
    partition_constraints=constraints,
    cache_file=CACHE_FILE,
    # True starts fresh; False tries to load an existing cache and pick back up from it.
    cache_overwrite=True,
)

In [None]:
# run = runner.StrategyRunner.from_completed(PROJECT, CACHE_FILE)

In [None]:
# Kick off training for the partitions managed by the runner.
# NOTE(review): presumably this blocks until every partition job has finished — confirm in runner.py.
run.train_all_partitions()

In [None]:
# Retrieve the synthetic data from the runner and display it as the cell output.
# NOTE(review): presumably a DataFrame stitched together from the per-partition results — confirm.
synthetic = run.get_training_synthetic_data()
synthetic

In [None]:
# run.cancel_all()