# Generate Synthetic Data using Gretel's Databricks Connector

- This notebook demonstrates how to use Gretel Workflows and Gretel's Databricks Connector to read data from Databricks, generate synthetic data, and write the synthetic data back to Databricks

- To run this notebook, you will need an API key from the [Gretel Console](https://console.gretel.ai/), as well as the connection parameters specified in the [Databricks Connector docs](https://docs.gretel.ai/create-synthetic-data/workflows-and-connectors/connectors/data-warehouse/databricks#permissions).

### Getting Started

In [0]:
pip install gretel-client

In [0]:
import logging
import yaml
from getpass import getpass

from gretel_client import create_or_get_unique_project
from gretel_client.config import configure_session, get_session_config
from gretel_client.rest_v1.api.connections_api import ConnectionsApi
from gretel_client.rest_v1.api.logs_api import LogsApi
from gretel_client.rest_v1.api.workflows_api import WorkflowsApi
from gretel_client.rest_v1.models import (
    CreateConnectionRequest,
    CreateWorkflowRunRequest,
    CreateWorkflowRequest,
)
from gretel_client.workflows.logs import print_logs_for_workflow_run

# Emit this notebook's own log messages down to DEBUG level.
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Authenticate with Gretel. api_key="prompt" asks for the key interactively so
# it is never stored in the notebook; validate=True checks the key right away
# instead of failing later on the first API call.
configure_session(api_key="prompt", validate=True)
session = get_session_config()

# REST API clients used by the cells below.
connection_api = session.get_v1_api(ConnectionsApi)
workflow_api = session.get_v1_api(WorkflowsApi)
log_api = session.get_v1_api(LogsApi)

# Reuse the "workflow-testing" project if it already exists, otherwise create it.
project = create_or_get_unique_project(name="workflow-testing")

# Left as a bare expression so that, as the last statement of the cell, the
# notebook displays the returned console URL.
project.get_console_url()

### Creating Connections

In [0]:
""" 
Creates source and destination connections for databricks
"""

def _databricks_connection_request(label, name, pat_prompt):
    """Prompt for one Databricks connection and return its create request.

    label      -- prefix shown in the prompts ("Source" / "Destination").
    name       -- name given to the Gretel connection.
    pat_prompt -- exact prompt text used when asking for the access token.
    """
    # Prompts are issued in this fixed order: hostname, HTTP path, catalog, schema.
    settings = {
        field: input(f"{label} Connection({field}):")
        for field in ("server_hostname", "http_path", "catalog", "schema")
    }
    # getpass keeps the token from being echoed into the notebook output.
    secrets = {"personal_access_token": getpass(prompt=pat_prompt)}
    return CreateConnectionRequest(
        name=name,
        project_id=project.project_guid,
        type="databricks",
        config=settings,
        credentials=secrets,
    )


# Connection the workflow reads from.
source_conn = connection_api.create_connection(
    _databricks_connection_request(
        "Source",
        "databricks-source",
        "Source Connection(Personal Access Token (PAT)):",
    )
)

# Connection the workflow writes to.
dest_conn = connection_api.create_connection(
    _databricks_connection_request(
        "Destination",
        "databricks-dest",
        "Destination Connection(Personal Access Token (PAT)): ",
    )
)

### Creating Workflow

In [0]:
""" 
Sample config for a Gretel Workflow that
1. Reads data from databricks
2. Generates synthetic data using our ACTGAN (https://docs.gretel.ai/create-synthetic-data/models/synthetics/gretel-actgan) model.
3. Writes generated synthetic data back to a Databricks Destination

Note: the volume name is set in the config of the 'databricks-write' action
"""

# Ask for the destination volume up front. The value is attached to the parsed
# config further down rather than interpolated into the YAML text, so characters
# such as quotes, backslashes, ':' or '#' in the answer cannot break or alter
# the document.
volume_name = input("Provide name for the volume: ").strip()

# Only the connection IDs and the project GUID are interpolated into the YAML.
# The doubled braces are f-string escapes: they yield single-brace references
# such as {outputs.databricks-read.dataset}, which are left as literal text here
# (presumably resolved by Gretel when the workflow runs).
workflow_config = yaml.safe_load(f"""
name: my-databricks-workflow
actions:
  - name: databricks-read
    type: databricks_source
    connection: {source_conn.id}
    config:
      sync:
        mode: subset
        algorithm: contiguous
        target_row_count: 1000
  - name: model-train-run
    type: gretel_tabular
    input: databricks-read
    config:
      project_id: {project.project_guid}
      train:
        model_config:
          schema_version: "1.0"
          name: tabular-actgan
          models:
            - actgan:
                data_source: __tmp__
                params:
                  epochs: auto
                  generator_dim:
                    - 1024
                    - 1024
                  discriminator_dim:
                    - 1024
                    - 1024
                  generator_lr: 0.0001
                  discriminator_lr: 0.00033
                  batch_size: auto
                  auto_transform_datetimes: false
                generate:
                  num_records: 5000
                privacy_filters:
                  outliers: null
                  similarity: null
        dataset: "{{outputs.databricks-read.dataset}}"
      run:
        num_records_multiplier: 1
  - name: databricks-write
    type: databricks_destination
    connection: {dest_conn.id}
    input: model-train-run
    config:
      sync:
        mode: replace
      dataset: "{{outputs.model-train-run.dataset}}"
"""
)

# Set the volume on the destination action, looked up by name so the
# assignment does not depend on the order of the actions above.
write_action = next(
    action
    for action in workflow_config["actions"]
    if action["name"] == "databricks-write"
)
write_action["config"]["volume"] = volume_name

# Register the workflow defined above in the Gretel project.
workflow = workflow_api.create_workflow(
    CreateWorkflowRequest(
        name="Databricks E2E Demo",
        project_id=project.project_guid,
        config=workflow_config,
    )
)

### Running Workflow

In [0]:
# Start a run of the workflow registered in the previous cell.
run_request = CreateWorkflowRunRequest(workflow_id=workflow.id)
workflow_run = workflow_api.create_workflow_run(run_request)

# Print the logs for this run.
print_logs_for_workflow_run(workflow_run.id, session)