<a target="_parent" href="https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/docs/notebooks/safe-synthetics/running-standalone-evaluate.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# 📊 Running Standalone Evaluate
This notebook allows you to run the Evaluate step with your own training and synthetic data. This is helpful if:
- You want to compare the output from Gretel Synthetics to other means of generating synthetic data
- You want to make sure that the train/test split is consistent across multiple Safe Synthetics runs so that the scores are comparable

## 💾 Install Gretel SDK

In [None]:
%%capture
%pip install -U gretel-client

## 🌐 Configure your Gretel Session

In [None]:
# Make the Gretel API key available as an environment variable.
# A key that is already set is left untouched; otherwise it is requested through
# a hidden prompt so the secret is never stored in the notebook itself.
import os
from getpass import getpass

if not os.environ.get("GRETEL_API_KEY"):
    os.environ["GRETEL_API_KEY"] = getpass("Gretel API key: ")

In [None]:
from gretel_client import create_or_get_unique_project
from gretel_client.config import get_session_config
from gretel_client.navigator_client import Gretel

# Instantiate the Gretel client.
# NOTE(review): `gretel` is not referenced again in the visible cells; presumably
# constructing it configures the default session read below — confirm before removing.
gretel = Gretel()
project_name = "standalone-evaluate"
# Session config object; later cells read `session.endpoint` and `session.api_key` from it.
session = get_session_config()
# Look up or create the project by name (per the helper's name — exact
# uniqueness semantics are defined in gretel_client, not here).
project = create_or_get_unique_project(name=project_name, session=session)

# Last expression of the cell, so the notebook displays the project's console URL.
project.get_console_url()

## 🔬 Load real and synthetic data

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

# Locations of the real and the synthetic CSV files
real_ds = "https://gretel-datasets.s3.us-west-2.amazonaws.com/hipaa_patients.csv"
synthetic_ds = "https://gretel-datasets.s3.us-west-2.amazonaws.com/synthetic_hipaa_patients.csv"
real_df, synthetic_df = (pd.read_csv(csv_url) for csv_url in (real_ds, synthetic_ds))

# Set aside 5% of the real rows as a holdout; the fixed seed keeps the split repeatable
train_df, holdout_df = train_test_split(real_df, random_state=42, test_size=0.05)

# Report the size of each dataset, one per line
print(
    f"Number of rows - train: {len(train_df)}",
    f"Number of rows - holdout: {len(holdout_df)}",
    f"Number of rows - synthetic: {len(synthetic_df)}",
    sep="\n",
)
train_df.head()

In [None]:
# Upload the pandas DataFrames with the Gretel file client and keep the returned file IDs
from gretel_client.files import FileClient

file_client = FileClient()

# Upload the three frames in order (train, holdout, synthetic), each with the "dataset" argument
resp_train, resp_holdout, resp_synthetic = (
    file_client.upload(frame, "dataset")
    for frame in (train_df, holdout_df, synthetic_df)
)

# Keep the id of each upload response; the workflow config refers to the files by these ids
train_file_id, holdout_file_id, synthetic_file_id = (
    upload_resp.id for upload_resp in (resp_train, resp_holdout, resp_synthetic)
)

## 🏃 Run Evaluate

In [None]:
import requests
import yaml

def run_workflow(config: str):
    """Submit a workflow config for execution and print how to find the run.

    Parses the YAML config, posts it to the ``/v2/workflows/exec_batch``
    endpoint for the current project, then prints the raw response body, the
    workflow id, the workflow run id, and a URL for the run derived from the
    project's console URL. The function returns right after submission; it
    does not poll or wait for the run to finish.

    Args:
        config: The workflow config to run, as a YAML string.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    config_dict = yaml.safe_load(config)

    response = requests.post(
        f"{session.endpoint}/v2/workflows/exec_batch",
        json={
            "workflow_config": config_dict,
            "project_id": project.project_guid,
        },
        headers={"Authorization": session.api_key}
    )
    # Surface API failures (bad key, invalid config, ...) as an HTTP error
    # instead of a confusing KeyError on the missing ids below.
    response.raise_for_status()
    response_body = response.json()

    print(response_body)

    workflow_id = response_body["workflow_id"]
    workflow_run_id = response_body["workflow_run_id"]

    # Strip the project guid from the project's console URL and append the
    # workflow/run path to point at this specific run.
    workflow_run_url = (
        f"{project.get_console_url().replace(project.project_guid, '')}workflows/"
        f"{workflow_id}/runs/{workflow_run_id}"
    )

    print(f"workflow: {workflow_id}")
    print(f"workflow run id: {workflow_run_id}")
    print(workflow_run_url)

In [None]:
# Workflow config as a YAML string, with the uploaded file ids interpolated.
# Because this is an f-string, `{{}}` is an escaped literal `{}` (an empty YAML mapping).
# Two steps are declared:
#   - "holdout": given the train and holdout file ids as inputs
#   - "eval": given the synthetic file id and "holdout" as inputs
#     (presumably a reference to the output of the step named "holdout" — confirm
#     against the Gretel workflow documentation)
eval_config = f"""
name: evaluate
version: "2"

steps:
  - name: holdout
    task: holdout
    inputs: [{train_file_id}, {holdout_file_id}]
    config: {{}}
  - name: eval
    task: evaluate_safe_synthetics_dataset
    inputs: [{synthetic_file_id}, "holdout"]
    config: {{}}
"""

# Submit the config; run_workflow prints the workflow ids and the run URL.
run_workflow(eval_config)