# Create synthetic mouse genome data

Create a synthetic version of the mouse genomes from the original experiment. To run this notebook, you will need an API key from the Gretel console at https://console.gretel.cloud.


In [1]:
%%capture
!pip install -U gretel-client

In [2]:
# Specify your Gretel API key and configure the client session

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

# Do not truncate long cell values when displaying DataFrames. The option name
# is spelled out in full: abbreviated names are matched by pattern and can
# become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', None)

# getpass prompts for the key without echoing it, and the key is passed
# straight to the client config rather than being stored in a notebook variable.
configure_session(ClientConfig(api_key=getpass(prompt="Enter Gretel API key"),
                               endpoint="https://api.gretel.cloud"))

                            

Enter Gretel API key········


In [10]:
# Create the Gretel project used by the model-training cell further down

from gretel_client import create_project

project_display_name = "synthetic-mouse-genomes"
project = create_project(display_name=project_display_name)

## Configure model hyperparameters
Load the default configuration template. This template will work well for most datasets. View other templates at https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics

## Load and preview the training dataset
Specify a data source to train the model on. This can be a local file, web location, or HDFS file.


In [34]:
import logging
import os
import pathlib
import tempfile


# Minimum number of rows to train on; a smaller dataset is repeated until it
# exceeds this size.
training_min_rows = 25000
# Scratch copy of the (repeated) training data. It lives in the platform's
# temporary directory instead of a hard-coded '/tmp', which does not exist on
# every operating system.
tmp_path = os.path.join(tempfile.gettempdir(), 'tmp_geno_train.csv')

# The data directory is resolved from the current working directory with any
# "/synthetics" path component stripped out.
base_path = pathlib.Path(os.getcwd().replace("/synthetics", ""))
data_path = base_path / 'mice_data_set' / 'data'

dataset_path = data_path / 'geno_train.csv'
seeds_path = data_path / 'geno_seeds.csv'

df = pd.read_csv(dataset_path)
dataset_rows = len(df)
if dataset_rows == 0:
    # Fail with a clear message rather than a ZeroDivisionError below.
    raise ValueError(f"Training dataset {dataset_path} contains no rows.")

# Number of copies of the original data written to the training file.
repeat_count = training_min_rows // dataset_rows + 1
# ignore_index gives the repeated frame a unique 0..n-1 index instead of
# repeating the original labels; the CSV is written without the index, so the
# file contents are unaffected.
df = pd.concat([df] * repeat_count, ignore_index=True)
df.to_csv(tmp_path, index=False)

# NOTE: INFO messages are hidden under Python's default logging level
# (WARNING) unless logging has been configured otherwise.
logging.info(f"Original training dataset length: {dataset_rows} rows.")
if repeat_count > 1:
    # Only report repetition when the data was actually repeated.
    logging.warning(f"Repeated {dataset_rows} row training dataset "
                    f"to {len(df)} rows to help RNN learn structure.")

df



Unnamed: 0,abBMD,SW16,rs29477109,rs27070938,rs259190588,rs26992391,rs29435046,rs227486741,rs27070712,rs27037798,rs49725879,rs27037823,rs29467625,rs27052855,rs29391151,rs51014101,rs6258876,rs46637219,rs27045535
0,0,0,1,1,0,1,1,1,1,0,1,1,0,1,0,0,1,0,1
1,0,0,1,1,1,1,1,1,1,1,2,2,1,2,1,1,1,1,0
2,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1
3,0,0,2,2,1,2,2,2,2,2,2,2,2,2,2,1,2,2,2
4,0,0,1,1,0,1,1,1,1,0,1,1,1,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
614,0,0,1,1,1,2,2,2,2,1,1,1,0,1,1,0,0,1,1
615,0,0,1,1,1,1,0,1,1,1,1,1,1,0,0,0,0,1,0
616,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
617,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [44]:
import json
import smart_open
import yaml

# Load the default synthetics configuration template. smart_open is referenced
# by module name so that the built-in open() is not shadowed for the rest of
# the notebook.
with smart_open.open("https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml", 'r') as stream:
    config = yaml.safe_load(stream)

# Fields passed to the 'seed' task below.
fields = ['abBMD', 'SW16']

task = {
    'type': 'seed',
    'attrs': {
        'fields': fields
    }
}

# Optimize parameters for complex dataset
# All overrides apply to the first (index 0) model entry of the template.
synthetics_config = config['models'][0]['synthetics']
synthetics_config['task'] = task
synthetics_config['params'].update({
    'epochs': 150,
    'vocab_size': 0,
    'rnn_units': 1024,
    'reset_states': False,
    'learning_rate': 0.001,
})
# Generate as many synthetic records as there are rows in the original
# (un-repeated) training dataset.
synthetics_config['generate']['num_records'] = dataset_rows
synthetics_config['privacy_filters']['similarity'] = None


print(json.dumps(config, indent=2))

{
  "schema_version": "1.0",
  "models": [
    {
      "synthetics": {
        "data_source": "__tmp__",
        "params": {
          "epochs": 150,
          "batch_size": 64,
          "vocab_size": 0,
          "reset_states": false,
          "learning_rate": 0.001,
          "rnn_units": 1024,
          "dropout_rate": 0.2,
          "overwrite": true,
          "early_stopping": true,
          "gen_temp": 1.0,
          "predict_batch_size": 64,
          "validation_split": false,
          "dp": false,
          "dp_noise_multiplier": 0.001,
          "dp_l2_norm_clip": 5.0,
          "dp_microbatches": 1
        },
        "validators": {
          "in_set_count": 10,
          "pattern_count": 10
        },
        "generate": {
          "num_records": 619,
          "max_invalid": null
        },
        "privacy_filters": {
          "outliers": "medium",
          "similarity": null
        },
        "task": {
          "type": "seed",
          "attrs": {
            

## Train the synthetic model
In this step, we will task a worker, running either in the Gretel cloud or locally, with training a synthetic model on the source dataset.

In [None]:
from gretel_client.helpers import poll


# Create a model object in the project from the configuration built above.
model = project.create_model_obj(model_config=config)
# Use the training CSV written to tmp_path as the model's data source.
model.data_source = tmp_path
# Submit the model; upload_data_source=True presumably uploads the local data
# source file along with the job -- confirm against the gretel-client docs.
model.submit(upload_data_source=True)

# Follow the job: the recorded output shows status changes and training logs
# being printed. Presumably this blocks until the job finishes -- TODO confirm.
poll(model)

[32mINFO: [0mStarting poller


{
    "uid": "615542a154e620c38d70e9d4",
    "model_name": "ambitious-adorable-giraffe",
    "runner_mode": "cloud",
    "user_id": "600f5e11bff62132eb718849",
    "project_id": "6155069d3e47327d5166b54e",
    "logs": null,
    "status_history": {
        "created": "2021-09-30T04:52:49.083369Z"
    },
    "last_modified": "2021-09-30T04:52:49.262132Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:42c01cffaa364d53cfc00e91e5df33050451079e6c12b62a99c438f7dbe6743d",
    "model_type": "synthetics",
    "config": {
        "schema_version": "1.0",
        "name": null,
        "models": [
            {
                "synthetics": {
                    "params": {
                        "field_delimiter": null,
                        "epochs": 150,
                        "batch_size": 64,

[32mINFO: [0mStatus is created. Model creation has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin model creation.
[32mINFO: [0mStatus is active. A worker has started creating your model!
2021-09-30T04:53:04.927792Z  Starting synthetic model training
2021-09-30T04:53:04.929630Z  Loading training data
2021-09-30T04:53:05.151652Z  Training data loaded
{
    "record_count": 25379,
    "field_count": 19
}
2021-09-30T04:53:14.452139Z  Creating semantic validators and preparing training data
2021-09-30T04:53:35.508525Z  Beginning ML model training
2021-09-30T04:54:18.035405Z  Training epoch completed
{
    "epoch": 0,
    "accuracy": 0.702,
    "loss": 0.7729,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-09-30T04:54:45.186605Z  Training epoch completed
{
    "epoch": 1,
    "accuracy": 0.8649,
    "loss": 0.3433,
    "val_accuracy": 0,
    "val_loss": 0,
    "batch": 0
}
2021-09-30T04:55:13.554763Z  Training epoch comple

# Save the synthetically generated genomes

In [None]:
# Read the model's "data_preview" artifact (a gzip-compressed CSV) and save it
# as a plain CSV in the same data directory as the training data.

preview_link = model.get_artifact_link("data_preview")
synthetic_genomes = pd.read_csv(preview_link, compression='gzip')

synthetic_genomes_path = data_path / 'synthetic_genomes.csv'
synthetic_genomes.to_csv(synthetic_genomes_path, index=False)

synthetic_genomes

# View the synthetic data quality report

In [None]:
# Generate report that shows the statistical performance between the training and synthetic data

import IPython
import smart_open

# Read the report inside a context manager so the stream is closed as soon as
# its contents have been read. smart_open is referenced by module name so the
# built-in open() is not shadowed.
with smart_open.open(model.get_artifact_link("report")) as report_stream:
    report_html = report_stream.read()

# The last expression is displayed as the cell's rich HTML output.
IPython.display.HTML(data=report_html)