# Create synthetic mouse genome data

Create a synthetic version of the mouse genomes from the original experiment. To run this notebook, you will need an API key from the Gretel console at https://console.gretel.cloud.


In [None]:
%%capture
!pip install -U gretel-client

In [None]:
# Configure the Gretel client session; the API key is requested interactively
# so it is never written into the notebook.

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

# Do not truncate wide column values when DataFrames are displayed.
pd.set_option('max_colwidth', None)

gretel_config = ClientConfig(
    endpoint="https://api.gretel.cloud",
    api_key=getpass(prompt="Enter Gretel API key"),
)
configure_session(gretel_config)

                            

In [None]:
# Create a project

from gretel_client import create_project

# The returned project handle is used later in the notebook to create the model.
project = create_project(display_name="synthetic-mouse-genomes")

## Load and preview the training dataset
Specify a data source to train the model on. This can be a local file, web location, or HDFS file.


In [None]:
 data_path / "geno_abBMD_train.csv"

In [None]:
import logging
import os
import pathlib

# Use a dedicated logger set to INFO so the informational message below is
# shown: the root logger defaults to WARNING, which drops logging.info() calls.
logging.basicConfig()  # no-op when logging handlers are already configured
logger = logging.getLogger("synthetic_mouse_genomes")
logger.setLevel(logging.INFO)

training_min_rows = 25000  # the dataset is repeated until it exceeds this many rows
tmp_path = '/tmp/tmp_geno_train.csv'  # the repeated dataset is written here

# Strip "/synthetics" from the working directory to find the base directory
# that holds mice_data_set/data.
base_path = pathlib.Path(os.getcwd().replace("/synthetics", ""))
data_path = base_path / 'mice_data_set' / 'data'

dataset_path = data_path / 'geno_abBMD_train.csv'
seeds_path = data_path / 'geno_seeds.csv'

df = pd.read_csv(dataset_path)
dataset_rows = len(df)
if dataset_rows == 0:
    # Fail with a clear message instead of a ZeroDivisionError further down.
    raise ValueError(f"Training dataset {dataset_path} contains no rows.")

# Stack whole copies of the dataset until it is longer than training_min_rows.
repeats = training_min_rows // dataset_rows + 1
df = pd.concat([df] * repeats)
df.to_csv(tmp_path, index=False)

logger.info(f"Original training dataset length: {dataset_rows} rows.")
if repeats > 1:
    # Only report a repetition when the data was actually duplicated.
    logger.warning(f"Repeated {dataset_rows} row training dataset "
                   f"to {len(df)} rows to help RNN learn structure.")

# Preview the first rows rather than dumping the whole repeated frame.
df.head()

## Configure model hyperparameters
Load the default configuration template. This template will work well for most datasets. View other templates at https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics

In [None]:
import json
from smart_open import open
import yaml

# Download the default synthetics template and parse it into a dict.
template_url = "https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml"
with open(template_url, 'r') as stream:
    config = yaml.safe_load(stream)

# Seed task built on the two phenome fields.
fields = ['abBMD', 'SW16']
task = {'type': 'seed', 'attrs': {'fields': fields}}

# Optimize parameters for complex dataset
synthetics = config['models'][0]['synthetics']
synthetics['task'] = task
synthetics['params'].update(
    epochs=150,
    vocab_size=38,
    rnn_units=768,
    reset_states=False,
    learning_rate=0.0016,
)
# Generate as many records as the original (un-repeated) training dataset has rows.
synthetics['generate']['num_records'] = dataset_rows
synthetics['privacy_filters']['similarity'] = None
synthetics['params'].update(dropout_rate=0.5645, gen_temp=0.9173)

print(json.dumps(config, indent=2))

## Train the synthetic model
In this step, we will task the worker running in the Gretel cloud, or locally, to train a synthetic model on the source dataset.

In [None]:
from gretel_client.helpers import poll


# Create a model object from the customized config and point it at the
# repeated training CSV that was written to tmp_path.
model = project.create_model_obj(model_config=config)
model.data_source = tmp_path
# NOTE(review): upload_data_source=True presumably uploads the local CSV along
# with the job — confirm against the gretel-client documentation.
model.submit(upload_data_source=True)

# Presumably waits for the submitted job to finish — TODO confirm.
poll(model)

## View the synthetic data quality report

In [None]:
# Generate report that shows the statistical performance between the training and synthetic data

from smart_open import open
from IPython.display import display, HTML

# Read the report inside a context manager so the remote stream is closed as
# soon as its content has been loaded.
with open(model.get_artifact_link("report")) as report_stream:
    report_html = report_stream.read()

# isolated=True is passed as display metadata for the rendered HTML.
display(HTML(data=report_html, metadata=dict(isolated=True)))

## Generate synthetic genome data using seed values from the synthetic phenome data
This ensures that the new synthetic genome data aligns one-to-one with the synthetic phenome data.

In [None]:
# Seed values are read from the phenome seeds CSV; one record is requested per
# seed row.
seedfile = str(data_path / 'phenome_abBMD_seeds.csv')
seed_df = pd.read_csv(seedfile)

# Create a record handler on the trained model with the seed file as its data
# source, then submit it.
rh = model.create_record_handler_obj(data_source=seedfile, params={"num_records": len(seed_df)})
rh.submit_cloud()

# Presumably waits for the generation job to finish — TODO confirm.
poll(rh)

# The "data" artifact is read as a gzip-compressed CSV.
synthetic_genomes = pd.read_csv(rh.get_artifact_link("data"), compression='gzip')
synthetic_genomes

In [None]:
# Drop the phenome information from the genome synth data and add back in the fields "id" and "discard"

# Also remove any "id"/"discard" columns left by a previous run of this cell,
# and ignore columns that are already gone, so the cell can be re-run safely.
genome_only = synthetic_genomes.drop(
    columns=['abBMD', 'SW16', 'id', 'discard'], errors='ignore'
)

# "id" is a 0-based running row number and "discard" is "no" for every row;
# both are placed in front of the genome columns.
synthetic_genomes = genome_only.assign(
    id=range(len(genome_only)), discard="no"
)[['id', 'discard', *genome_only.columns]]

In [None]:
# Preview the re-ordered frame: "id" and "discard" first, followed by the genome columns.
synthetic_genomes

## Save the synthetically generated genomes

In [None]:
# Write the synthetic genomes into the data directory as a space-separated
# text file, leaving out the DataFrame index.
synthetic_genomes_file = data_path / 'synthetic_genomes.txt'
synthetic_genomes.to_csv(synthetic_genomes_file, sep=' ', index=False)