# Create synthetic mouse phenome data

Create a synthetic version of the mouse phenomes from the original experiment. The training data is available after running `01_create_phenome_training_data.ipynb`. To run this notebook, you will need an API key from the Gretel console at https://console.gretel.cloud.


In [1]:
%%capture
!pip install -U gretel-client

In [2]:
# Authenticate this notebook session against the Gretel API

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option('max_colwidth', None)

# The key is requested interactively so it is never written into the notebook.
session_config = ClientConfig(
    api_key=getpass(prompt="Enter Gretel API key"),
    endpoint="https://api.gretel.cloud",
)
configure_session(session_config)

                            

Enter Gretel API key········


In [3]:
# Create the Gretel project used by the rest of this notebook

from gretel_client import create_project

project_display_name = "synthetic-mouse-phenomes"
project = create_project(display_name=project_display_name)

## Configure model hyperparameters
Load the default configuration template. This template will work well for most datasets. View other templates at https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics

In [7]:
import json
from smart_open import open  # NOTE: shadows the builtin `open` from here on
import yaml

# Fetch the default synthetics template and parse it into a plain dict.
template_url = "https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml"
with open(template_url, 'r') as template_file:
    config = yaml.safe_load(template_file)

# Optimize parameters for complex dataset
synthetics_config = config['models'][0]['synthetics']
synthetics_config['params'].update({
    'epochs': 150,
    'vocab_size': 0,
    'rnn_units': 1024,
    'reset_states': False,
    'learning_rate': 0.001,
})
synthetics_config['privacy_filters']['similarity'] = None

# Show the final configuration that will be submitted.
print(json.dumps(config, indent=2))

{
  "schema_version": "1.0",
  "models": [
    {
      "synthetics": {
        "data_source": "__tmp__",
        "params": {
          "epochs": 150,
          "batch_size": 64,
          "vocab_size": 0,
          "reset_states": false,
          "learning_rate": 0.001,
          "rnn_units": 1024,
          "dropout_rate": 0.2,
          "overwrite": true,
          "early_stopping": true,
          "gen_temp": 1.0,
          "predict_batch_size": 64,
          "validation_split": false,
          "dp": false,
          "dp_noise_multiplier": 0.001,
          "dp_l2_norm_clip": 5.0,
          "dp_microbatches": 1
        },
        "validators": {
          "in_set_count": 10,
          "pattern_count": 10
        },
        "generate": {
          "num_records": 5000,
          "max_invalid": null
        },
        "privacy_filters": {
          "outliers": "medium",
          "similarity": null
        }
      }
    }
  ]
}


## Load and preview the training dataset
Specify a data source to train the model on. This can be a local file, web location, or HDFS file.


In [8]:
import os
import pathlib


# Locate the directory that contains `mice_data_set`. When the notebook is run
# from a directory named `synthetics`, step up to its parent; otherwise use the
# working directory unchanged. Comparing the final path component (instead of
# string-replacing "/synthetics") works with any path separator and leaves
# unrelated parts of the path, such as ".../synthetics-demo/...", untouched.
working_dir = pathlib.Path(os.getcwd())
base_path = working_dir.parent if working_dir.name == "synthetics" else working_dir
data_path = base_path / 'mice_data_set' / 'data'
dataset_path = data_path / 'phenomes_batch_0.csv'

# Load the training phenomes and display them as the cell output.
df = pd.read_csv(dataset_path)
df

Unnamed: 0,BMD,SW16,abBMD,TA,tibia,EDL,soleus,plantaris,gastroc,SW6,sacweight,SW17,testisweight,methage,PPIweight,bw1
0,1.88,0.0,0.0,62.3,18.06,13.2,8.5,18.8,154.6,0.0,35.7,0.0,0.17,54.0,35.0,29.8
1,1.89,0.0,0.0,54.1,18.14,11.2,6.8,17.6,143.6,0.0,34.1,0.0,0.19,54.0,32.2,28.7
2,1.95,0.0,0.0,56.5,18.18,12.9,8.7,17.7,148.6,0.0,41.8,0.0,0.20,54.0,37.5,30.6
3,1.92,0.0,0.0,64.0,18.35,14.1,9.2,20.6,157.4,0.0,39.5,0.0,0.19,54.0,37.7,31.8
4,1.89,0.0,0.0,65.7,18.24,13.9,9.2,20.5,167.0,0.0,36.0,0.0,0.18,54.0,35.8,32.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39750,1.85,0.0,0.0,57.8,18.44,13.9,7.7,18.1,150.4,0.0,37.8,0.0,0.20,54.0,33.0,30.5
39751,1.92,0.0,0.0,64.3,18.92,13.1,10.2,15.1,149.6,0.0,43.8,0.0,0.19,54.0,37.1,34.4
39752,1.84,0.0,0.0,53.7,18.53,10.6,10.8,17.0,140.7,0.0,38.9,0.0,0.18,54.0,34.6,33.1
39753,1.88,0.0,0.0,63.5,18.44,14.1,7.4,21.5,158.3,0.0,41.0,0.0,0.20,54.0,37.2,34.1


## Train the synthetic model
In this step, we task a worker, running either in the Gretel cloud or locally, with training a synthetic model on the source dataset.

In [None]:
from gretel_client.helpers import poll


# Request one synthetic record per row of the training DataFrame.
generate_settings = config['models'][0]['synthetics']['generate']
generate_settings['num_records'] = len(df)

# Build the model from the config, point it at the training CSV, and submit it
# with the data source uploaded alongside the job.
model = project.create_model_obj(model_config=config)
model.data_source = str(dataset_path)
model.submit(upload_data_source=True)

# Follow the job's status and log output.
poll(model)

[32mINFO: [0mStarting poller


{
    "uid": "61553d2aa142f2df3053e23b",
    "model_name": "enchanted-enthusiastic-horse",
    "runner_mode": "cloud",
    "user_id": "600f5e11bff62132eb718849",
    "project_id": "615525efadc689d99009f887",
    "logs": null,
    "status_history": {
        "created": "2021-09-30T04:29:30.719643Z"
    },
    "last_modified": "2021-09-30T04:29:30.897137Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:42c01cffaa364d53cfc00e91e5df33050451079e6c12b62a99c438f7dbe6743d",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {
                "synthetics": {
                    "params": {
                        "field_delimiter": null,
                        "epochs": 150,
                        "batch_size": 6

[32mINFO: [0mStatus is created. Model creation has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin model creation.
[32mINFO: [0mStatus is active. A worker has started creating your model!
2021-09-30T04:29:44.284651Z  Starting synthetic model training
2021-09-30T04:29:44.286610Z  Loading training data
2021-09-30T04:29:44.500191Z  Training data loaded
{
    "record_count": 39755,
    "field_count": 16
}
2021-09-30T04:29:58.901456Z  Creating semantic validators and preparing training data
2021-09-30T04:30:25.147786Z  Beginning ML model training
2021-09-30T04:31:48.712383Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.6347,
    "loss": 1.0761,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-09-30T04:33:03.115537Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.8067,
    "loss": 0.544,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-09-30T04:34:19.933798Z  Training epoch comple

# Save the synthetically generated phenomes

In [None]:
# Download the generated records, write them to CSV, and display them

preview_url = model.get_artifact_link("data_preview")
synthetic_phenomes = pd.read_csv(preview_url, compression='gzip')

# Saved under `data_path`, without the DataFrame index column.
output_path = data_path / 'synthetic_phenomes_batch_0.csv'
synthetic_phenomes.to_csv(output_path, index=False)

synthetic_phenomes

# View the synthetic data quality report

In [None]:
# Generate report that shows the statistical performance between the training and synthetic data

import IPython
from smart_open import open

# Read the report inside a `with` block so the underlying stream is closed as
# soon as the HTML has been loaded, rather than leaving the handle open.
with open(model.get_artifact_link("report")) as report_stream:
    report_html = report_stream.read()

# The last expression is rendered as the cell's rich HTML output.
IPython.display.HTML(data=report_html)