# Create synthetic mouse genome data

Create a synthetic version of the mouse genomes from the original experiment. To run this notebook, you will need an API key from the Gretel console at https://console.gretel.cloud.


In [1]:
%%capture
!pip install -U gretel-client

In [15]:
# Specify your Gretel API key

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

# Do not truncate long cell values when DataFrames are rendered
pd.set_option('max_colwidth', None)

# The API key is prompted for interactively so it is never stored in the notebook
session_config = ClientConfig(
    api_key=getpass(prompt="Enter Gretel API key"),
    endpoint="https://api.gretel.cloud",
)
configure_session(session_config)

                            

Enter Gretel API key········


In [16]:
# Create the Gretel project that the synthetic model will be created in

from gretel_client import create_project

project_display_name = "synthetic-mouse-genomes"
project = create_project(display_name=project_display_name)

## Load and preview the training dataset
Specify a data source to train the model on. This can be a local file, web location, or HDFS file.


In [17]:
import logging
import os
import pathlib
import tempfile


# Minimum number of rows to train on; smaller datasets are repeated to reach it.
training_min_rows = 25000
# Scratch copy of the (possibly repeated) training data. The training cell
# uploads this file as the model's data source.
tmp_path = os.path.join(tempfile.gettempdir(), 'tmp_geno_train.csv')

# The data directory is resolved relative to the parent of the "synthetics"
# working directory. Only a trailing "synthetics" path component is stripped:
# a plain string replace would also rewrite unrelated parts of the path and
# would not match Windows path separators.
current_dir = pathlib.Path.cwd()
base_path = current_dir.parent if current_dir.name == "synthetics" else current_dir
data_path = base_path / 'data'

dataset_path = data_path / 'geno_abBMD_train.csv'
seeds_path = data_path / 'geno_seeds.csv'

df = pd.read_csv(dataset_path)
dataset_rows = len(df)
if dataset_rows == 0:
    # Fail with a clear message instead of a ZeroDivisionError below
    raise ValueError(f"Training dataset {dataset_path} contains no rows.")

# Stack whole copies of the dataset until it exceeds training_min_rows rows
df = pd.concat([df] * (training_min_rows // dataset_rows + 1))
df.to_csv(tmp_path, index=False)

# NOTE: the INFO message is only shown if logging is configured to emit
# INFO-level records; the root logger defaults to WARNING.
logging.info(f"Original training dataset length: {dataset_rows} rows.")
logging.warning(f"Repeated {dataset_rows} row training dataset "
                f"to {len(df)} rows to help RNN learn structure.")

df



Unnamed: 0,rs27052855,rs257710525,rs6258876,rs49153109,rs29395706,rs49725879,rs6284806,rs27052698,rs255791755,rs49072129,rs29467625,rs27037903,rs50536616,rs240744127,rs27037855,rs27037853,rs29464487,abBMD,SW16
0,0,0,0,1,0,1,1,0,0,0,0,0,1,1,1,1,1,0.0,0.0
1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,0.0,0.0
2,2,1,1,2,2,2,2,1,0,1,1,1,2,2,2,2,1,0.0,0.0
3,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0.0,0.0
4,2,2,2,2,2,2,2,0,1,2,2,2,2,2,2,2,2,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,1,1,0,1,1,1,1,0,0,0,0,1,1,1,1,1,2,0.0,0.0
858,0,0,0,1,1,1,1,0,0,0,1,0,1,1,1,1,1,0.0,0.0
859,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0
860,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0


## Configure model hyperparameters
Load the default configuration template. This template will work well for most datasets. View other templates at https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics

In [18]:
import json
from smart_open import open
import yaml

# Default synthetics configuration template published in gretel-blueprints
CONFIG_TEMPLATE_URL = "https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml"

with open(CONFIG_TEMPLATE_URL, 'r') as stream:
    config = yaml.safe_load(stream)

# Columns used as seed fields for conditional generation
fields = ['abBMD', 'SW16']

task = {
    'type': 'seed',
    'attrs': {
        'fields': fields
    }
}

# Optimize parameters for complex dataset.
# Each hyperparameter is set exactly once, to the value that takes effect.
synthetics_config = config['models'][0]['synthetics']
synthetics_config['task'] = task
synthetics_config['params'].update({
    'epochs': 150,
    'vocab_size': 38,
    'rnn_units': 768,
    'reset_states': False,
    'learning_rate': 0.0016,
    'dropout_rate': 0.5645,
    'gen_temp': 0.9173,
})
# Generate as many records as the original (un-repeated) training dataset has
synthetics_config['generate']['num_records'] = dataset_rows
# Disable the similarity privacy filter
synthetics_config['privacy_filters']['similarity'] = None

print(json.dumps(config, indent=2))

{
  "schema_version": "1.0",
  "models": [
    {
      "synthetics": {
        "data_source": "__tmp__",
        "params": {
          "epochs": 150,
          "batch_size": 64,
          "vocab_size": 38,
          "reset_states": false,
          "learning_rate": 0.0016,
          "rnn_units": 768,
          "dropout_rate": 0.5645,
          "overwrite": true,
          "early_stopping": true,
          "gen_temp": 0.9173,
          "predict_batch_size": 64,
          "validation_split": false,
          "dp": false,
          "dp_noise_multiplier": 0.001,
          "dp_l2_norm_clip": 5.0,
          "dp_microbatches": 1
        },
        "validators": {
          "in_set_count": 10,
          "pattern_count": 10
        },
        "generate": {
          "num_records": 862,
          "max_invalid": null
        },
        "privacy_filters": {
          "outliers": "medium",
          "similarity": null
        },
        "task": {
          "type": "seed",
          "attrs": {
     

## Train the synthetic model
In this step, we will task the worker running in the Gretel cloud, or locally, to train a synthetic model on the source dataset.

In [19]:
from gretel_client.helpers import poll


# Build a model object in the project from the configuration prepared above
model = project.create_model_obj(model_config=config)
# Train on the local CSV written by the data-loading cell
model.data_source = tmp_path
# upload_data_source=True presumably uploads the local file with the job — TODO confirm
model.submit(upload_data_source=True)

# Follow the submitted job; appears to stream status and log output until it finishes
poll(model)

[32mINFO: [0mStarting poller


{
    "uid": "6160b9d459114be50ec16f68",
    "model_name": "fancy-quirky-ostrich",
    "runner_mode": "cloud",
    "user_id": "5f45aedbbff62139017abfeb",
    "project_id": "6160b9abcdff71995f27f260",
    "status_history": {
        "created": "2021-10-08T21:36:20.693747Z"
    },
    "last_modified": "2021-10-08T21:36:20.929073Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:5188b73e1fc582fde1b3d77cac52d03a5e26a7bcc59b68e52fd04f4a8501b7d0",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {
                "synthetics": {
                    "params": {
                        "field_delimiter": null,
                        "epochs": 150,
                        "batch_size": 64,
                       

[32mINFO: [0mStatus is created. Model creation has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin model creation.
[32mINFO: [0mStatus is active. A worker has started creating your model!
2021-10-08T21:36:38.139943Z  Starting synthetic model training
2021-10-08T21:36:38.141956Z  Loading training data
2021-10-08T21:36:38.500919Z  Training data loaded
{
    "record_count": 25860,
    "field_count": 19
}
2021-10-08T21:36:47.430626Z  Creating semantic validators and preparing training data
2021-10-08T21:37:08.434763Z  Beginning ML model training
2021-10-08T21:37:34.513272Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.7134,
    "loss": 0.8334,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-08T21:37:52.526779Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.8934,
    "loss": 0.2913,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-10-08T21:38:10.661710Z  Training epoch compl

2021-10-08T21:46:57.114069Z  Saving model archive
2021-10-08T21:47:00.228051Z  Creating synthetic quality report
2021-10-08T21:47:15.793839Z  Uploading artifacts to Gretel Cloud
2021-10-08T21:47:17.828079Z  Model creation complete!


## View the synthetic data quality report

In [1]:
# Show the report comparing the statistical performance of the training and synthetic data.
# NOTE: this cell needs `model` from the training cell; run that cell first,
# otherwise it fails with a NameError.

from smart_open import open
# `display` and `HTML` are part of the public IPython.display API;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML


# Read the report through a context manager so the remote stream is closed
with open(model.get_artifact_link("report")) as report_file:
    report_html = report_file.read()

display(HTML(data=report_html, metadata=dict(isolated=True)))

NameError: name 'model' is not defined

## Generate synthetic genome data using seed values from the synthetic phenome data
This ensures that the new synthetic genome data aligns one-to-one with the synthetic phenome data.

In [21]:
# Seed values come from the synthetic phenome data
seedfile = str(data_path / 'phenome_abBMD_seeds.csv')
seed_df = pd.read_csv(seedfile)

# Request exactly one generated record per seed row
generation_params = {"num_records": len(seed_df)}
rh = model.create_record_handler_obj(data_source=seedfile, params=generation_params)
rh.submit_cloud()

poll(rh)

# Load the gzip-compressed generated records into a DataFrame and display it
synthetic_data_link = rh.get_artifact_link("data")
synthetic_genomes = pd.read_csv(synthetic_data_link, compression='gzip')
synthetic_genomes

[32mINFO: [0mStarting poller


{
    "uid": "6160be45051af656772472e7",
    "model_name": null,
    "runner_mode": "cloud",
    "user_id": "5f45aedbbff62139017abfeb",
    "project_id": "6160b9abcdff71995f27f260",
    "status_history": {
        "created": "2021-10-08T21:55:17.684000Z"
    },
    "last_modified": "2021-10-08T21:55:17.831000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:5188b73e1fc582fde1b3d77cac52d03a5e26a7bcc59b68e52fd04f4a8501b7d0",
    "model_id": "6160b9d459114be50ec16f68",
    "action": "generate",
    "config": {
        "data_source": "gretel_7774550601904945ba7d28c0eb4d48a3_phenome_abBMD_seeds.csv",
        "params": {
            "num_records": 5000,
            "max_invalid": 10000
        }
    }
}


[32mINFO: [0mStatus is created. A Record generation job has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin generating synthetic records.
[32mINFO: [0mStatus is active. A worker has started!
2021-10-08T21:55:34.809490Z  Loading model to worker
2021-10-08T21:55:34.810387Z  Attempting to load model from Gretel Cloud
2021-10-08T21:55:36.656544Z  Checking for synthetic smart seeds
2021-10-08T21:55:36.924274Z  Loaded 5000 smart seeds for generation
2021-10-08T21:55:36.925178Z  Loading model
2021-10-08T21:55:39.306851Z  Generating records
{
    "num_records": 5000
}
2021-10-08T21:55:44.321197Z  Generation in progress
{
    "current_valid_count": 37,
    "current_invalid_count": 1,
    "new_valid_count": 37,
    "new_invalid_count": 1,
    "completion_percent": 0.74
}
2021-10-08T21:55:49.328683Z  Generation in progress
{
    "current_valid_count": 88,
    "current_invalid_count": 6,
    "new_valid_count": 51,
    "new_invalid_count": 5,
   

2021-10-08T21:58:49.537008Z  Generation in progress
{
    "current_valid_count": 2124,
    "current_invalid_count": 200,
    "new_valid_count": 55,
    "new_invalid_count": 7,
    "completion_percent": 42.48
}
2021-10-08T21:58:54.543125Z  Generation in progress
{
    "current_valid_count": 2182,
    "current_invalid_count": 203,
    "new_valid_count": 58,
    "new_invalid_count": 3,
    "completion_percent": 43.64
}
2021-10-08T21:58:59.550012Z  Generation in progress
{
    "current_valid_count": 2239,
    "current_invalid_count": 209,
    "new_valid_count": 57,
    "new_invalid_count": 6,
    "completion_percent": 44.78
}
2021-10-08T21:59:04.556878Z  Generation in progress
{
    "current_valid_count": 2298,
    "current_invalid_count": 212,
    "new_valid_count": 59,
    "new_invalid_count": 3,
    "completion_percent": 45.96
}
2021-10-08T21:59:09.562426Z  Generation in progress
{
    "current_valid_count": 2358,
    "current_invalid_count": 215,
    "new_valid_count": 60,
    "new_inv

2021-10-08T22:02:09.763068Z  Generation in progress
{
    "current_valid_count": 4355,
    "current_invalid_count": 399,
    "new_valid_count": 56,
    "new_invalid_count": 3,
    "completion_percent": 87.1
}
2021-10-08T22:02:14.769033Z  Generation in progress
{
    "current_valid_count": 4411,
    "current_invalid_count": 402,
    "new_valid_count": 56,
    "new_invalid_count": 3,
    "completion_percent": 88.22
}
2021-10-08T22:02:19.774008Z  Generation in progress
{
    "current_valid_count": 4466,
    "current_invalid_count": 407,
    "new_valid_count": 55,
    "new_invalid_count": 5,
    "completion_percent": 89.32
}
2021-10-08T22:02:24.780343Z  Generation in progress
{
    "current_valid_count": 4522,
    "current_invalid_count": 410,
    "new_valid_count": 56,
    "new_invalid_count": 3,
    "completion_percent": 90.44
}
2021-10-08T22:02:29.786093Z  Generation in progress
{
    "current_valid_count": 4582,
    "current_invalid_count": 412,
    "new_valid_count": 60,
    "new_inva

Unnamed: 0,rs27052855,rs257710525,rs6258876,rs49153109,rs29395706,rs49725879,rs6284806,rs27052698,rs255791755,rs49072129,rs29467625,rs27037903,rs50536616,rs240744127,rs27037855,rs27037853,rs29464487,abBMD,SW16
0,1,2,2,2,2,2,2,1,0,1,2,1,2,2,2,2,1,0.0,0.0
1,2,1,2,2,1,2,2,1,0,2,1,1,2,2,2,2,2,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0
3,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0.0,0.0
4,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
4996,0,0,1,1,1,1,1,0,0,1,0,0,1,1,1,1,1,0.0,0.0
4997,1,1,1,2,1,2,2,1,0,1,1,1,2,2,2,2,1,0.0,0.0
4998,2,1,2,2,2,2,2,1,1,1,2,2,2,2,2,2,1,0.0,0.0


In [22]:
# Drop the phenome information from the genome synth data and add back in the fields "id" and "discard"

# Built as a single chain so the cell can be re-run safely:
# errors='ignore' tolerates the phenome columns having already been dropped,
# and assign() simply overwrites "id"/"discard" with the same values.
synthetic_genomes = (
    synthetic_genomes
    .drop(columns=['abBMD', 'SW16'], errors='ignore')
    .assign(
        # Sequential row number 0..n-1
        id=lambda frame: range(len(frame)),
        # Every generated record is kept
        discard="no",
    )
)

## Save the synthetically generated genomes

In [23]:
# Write the synthetic genomes next to the other data files, without the index column
output_path = data_path / 'synthetic_genomes.csv'
synthetic_genomes.to_csv(output_path, index=False)