# Optuna Hyperparameter Tuning for Mice Synthetic Genome Data
* This is an optional notebook that will help you decide on the best synthetic training parameters to use.
* This notebooks requires you to have run these three previous notebooks first: 01_build_phenome_training_data, 02_create_synthetic_mouse_phenomes and 03_build_genome_training_data.
* The notebook 03_build_genome_training_data lets you pick a list of genome batches that you'd like to tune on.

In [None]:
%%capture
!pip install -U gretel-client

In [None]:
!pip install optuna

## Get started by authenticating

In [1]:
# Specify your Gretel API key

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

pd.set_option('max_colwidth', None)

configure_session(ClientConfig(api_key=getpass(prompt="Enter Gretel API key"), 
                               endpoint="https://api.gretel.cloud"))


Enter Gretel API key········


## Define your workspace

In [2]:
import os
import pathlib
import pandas as pd

base_path = pathlib.Path(os.getcwd().replace("/synthetics", ""))
data_path = base_path / 'mice_data_set' / 'data' 

## Create a synthetic training config structure

In [3]:
# Grab the default Synthetic Config file:

from smart_open import open
import yaml

with open("https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml", 'r') as stream:
    config = yaml.safe_load(stream)

In [4]:
# Turn the Privacy Filters off

config['models'][0]['synthetics']['privacy_filters']['outliers'] = None
config['models'][0]['synthetics']['privacy_filters']['similarity'] = None

In [5]:
# Create a seed task

fields=['abBMD', 'SW16']

task = {
    'type': 'seed',
    'attrs': {
        'fields': fields
    }
}

## Read in training batches and seed data

In [6]:
# Read in the list of training batches
# This can be created in the 03_build_genome_training_data notebook

filename = data_path / "batch_training_list.csv"
training_list_df = pd.read_csv(filename)
batches = list(training_list_df["batch"])
batch_cnt = len(batches)

In [7]:
# Read in the phenome seed df
# This is created in the 02_create_synthetic_mouse_phenomes notebook

seedfile = str(data_path / 'phenome_abBMD_seeds.csv')
seed_df = pd.read_csv(seedfile)

## Create an Optuna objective function
* This objective function is what Optuna calls each time/trial.  It picks the next set of params
to use, creates and trains a model, generates synthetic data using the phenome seeds, and then
runs GWAS to determine an F1 score. When we create the Optuna study, we'll tell it we want
to maximize the result of this objective function (F1).

In [92]:
from gretel_client import projects
from gretel_client import create_project
import time
import optuna
from gretel_client.config import RunnerMode
from gretel_client.helpers import poll
from random import randint

def objective(trial: optuna.Trial):
 
    # Set which params you want to tune
    config['models'][0]['synthetics']['params']['vocab_size'] = trial.suggest_int(name="vocab_size", low=0, high=50000, step=5000)
    config['models'][0]['synthetics']['params']['rnn_units'] = trial.suggest_int(name="rnn_units", low=64, high=1024, step=64)
    config['models'][0]['synthetics']['params']['dropout_rate'] = trial.suggest_float("dropout_rate", .1, .75)
    config['models'][0]['synthetics']['params']['gen_temp'] = trial.suggest_float("gen_temp", .8, 1.2)
    config['models'][0]['synthetics']['params']['learning_rate'] = trial.suggest_float("learning_rate",  .001, 0.01, log=True)
    config['models'][0]['synthetics']['params']['reset_states'] = trial.suggest_categorical(
        "reset_states", choices=[True, False])

    # Create a new project
    seconds = int(time.time())
    project_name = "Tuning Experiment" + str(seconds)
    project = create_project(display_name=project_name)
    
    # Pick a random dataset from the training list, use just the first 34 pos ones
    batch_pos = randint(0, 33)
    batch = batches[batch_pos]
    batchfile = "geno_abBMD_batch" + str(batch) + "_train.csv"
    filename = str(data_path / "genome_training_data" / batchfile)
    print("\n\nTraining_set: " + str(batchfile))
    
    # Create a model 
    
    model = project.create_model_obj(model_config=config)
    model.data_source = filename
    model.submit(upload_data_source=True)
        
    # Watch for completion
                
    status = "active"

    while ((status == "active") or (status == "pending")):
        #Sleep a bit here
        time.sleep(60)
            
        model._poll_job_endpoint()
        status = model.__dict__['_data']['model']['status']
        print("Model status: " + str(status))
        if ((status == "error") or (status == "lost")):
            return 0
    print("\nModel training complete!")
        
    # Now generate synthetic data using the seeds
    
    rh = model.create_record_handler_obj(data_source=seedfile, params={"num_records": len(seed_df)})
    rh.submit_cloud()
    
    # Watch for its completion
    
    status = rh.__dict__['_data']['billing_data']['job_status']
    while ((status == "active") or (status == "pending")):
        #Sleep a bit here
        time.sleep(60)        
        rh._poll_job_endpoint()
        status = rh.__dict__['_data']['billing_data']['job_status']
        print("Model " + model_name + " has generating status: " + status)
        
    print("Model generation complete!")

    synthetic_genomes = pd.read_csv(rh.get_artifact_link("data"), compression='gzip')
    
    # Drop the phenome information from the genome synth data and add back in the fields "id" and "discard"

    id_col = []
    discard_col = []
    for i in range(len(synthetic_genomes.index)):
        id_col.append(i)
        discard_col.append("no")

    synthetic_genomes = synthetic_genomes.drop(['abBMD', 'SW16'], axis=1)
    columns = ['id', 'discard']
    columns = columns + list(synthetic_genomes.columns)   
    synthetic_genomes["id"] = id_col
    synthetic_genomes["discard"] = discard_col
    synthetic_genomes = synthetic_genomes.filter(columns)
    
    # Save the synthetic data
    
    synthetic_genomes.to_csv(data_path / 'synthetic_genomes.txt', index=False, sep=' ')
    
    # Save the associated map file
    mapfile = "map_abBMD_batch" + str(batch) + ".txt"
    filename = str(data_path / "genome_map_data" / mapfile)
    print("\nUsing map file: " + mapfile)
    map_df = pd.read_csv(filename, sep=' ')
    mapfile = "map_abBMD.txt"
    filename = str(data_path / mapfile)
    map_df.to_csv(filename, sep=' ', header=True, index=False)    

    # Compute the F1
    
    f1 = computeF1()
            
    return f1

In [9]:
def computeF1():
    
    from sklearn.metrics import f1_score

    base_path = pathlib.Path(os.getcwd().replace("/synthetics", ""))
    data_path = base_path / 'mice_data_set' / 'data' 
    real_gwas_path = base_path / 'mice_data_set' / 'out' 
    synthetic_gwas_path = base_path / 'mice_data_set' / 'out_synth'
   
    # Run GWAS
    !rm ../mice_data_set/out_synth/*.csv
    print("\nStarting GWAS run!")
    !R --vanilla < ../research_paper_code/src/map_gwas_batch.R &> /tmp/map.logerr
    
    # Read in the original results
    PHENOTYPE = 'abBMD'
    BATCH = 'batch1'
    real_snps = pd.read_csv(real_gwas_path / f'lm_{PHENOTYPE}_1_79646.csv') #, usecols=['snp', 'p']) # , usecols=['snp', 'p']
    real_snps = real_snps.rename(columns={real_snps.columns[0]: 'index'})
    real_snps = real_snps[['index', 'snp', 'p']]
    real_snps['interest'] = real_snps['p'].apply(lambda x: True if x <= 1e-8 else False)
    
    # Read in the new results
    synthetic_snps = pd.read_csv(synthetic_gwas_path / f'lm_{PHENOTYPE}.csv')  
    synthetic_snps = synthetic_snps.rename(columns={synthetic_snps.columns[0]: 'index'})
    synthetic_snps = synthetic_snps[['index', 'snp', 'p']]
    synthetic_snps['interest'] = synthetic_snps['p'].apply(lambda x: True if x <= 1e-8 else False)
     
    combined = pd.merge(synthetic_snps, 
         real_snps, 
         how='inner', 
         on=['snp'],
         suffixes=['_synthetic', '_real']) 
    
    f1 = f1_score(combined['interest_real'], combined['interest_synthetic'], average='weighted') 
    print("\nComputed F1: " + str(f1))
                                          
    return f1                                                                              
                                                

## Create your Optuna study and set off one trial
* Refer to these guidelines when deciding what sampling algorithm to use: https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/003_efficient_optimization_algorithms.html#sphx-glr-tutorial-10-key-features-003-efficient-optimization-algorithms-py
* To be able to run mutliple Optuna trials in parallel, you must specfiy an RDB storage location.  Here we set storage="sqlite:///tuning.db" as sqlite commonly comes preinstalled with most operating systems.  No need to create the tuning.db first, if it doesn't exist, Optuna will create it.

In [10]:
# Run this cell in first notebook
study = optuna.create_study(study_name="illumina_batch1_tunings13",sampler=optuna.samplers.RandomSampler(),storage="sqlite:///tuning.db", direction="maximize")

study.optimize(objective, n_trials=1)

print(f"Optimized GWAS F1: {study.best_value:.5f}")

print("Best params:")
for key, value in study.best_params.items():
    print(f"\t{key}: {value}")

[32m[I 2021-11-07 15:54:44,193][0m A new study created in RDB with name: illumina_batch1_tunings13[0m




Training_set: geno_abBMD_batch1339_train.csv
Model status: active
Model status: active
Model status: active
Model status: active
Model status: active
Model status: active
Model status: completed

Model training complete!


[32mINFO: [0mStarting poller


{
    "uid": "6187f870ea1c8c9c26e8d13e",
    "model_name": null,
    "runner_mode": "cloud",
    "user_id": "5f05d810492fbf248648cb37",
    "project_id": "6187f6c47ee5e9266521a5df",
    "status_history": {
        "created": "2021-11-07T16:01:52.669000Z"
    },
    "last_modified": "2021-11-07T16:01:52.749000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-east-2.amazonaws.com/gretelai/synthetics@sha256:64bf7fd86286af74453bd1e6afb8fcb7daf6bc1ef752107e5e69af346abcb2c9",
    "model_id": "6187f6c52ce4ebea93e14a6e",
    "action": "generate",
    "config": {
        "data_source": "gretel_24026f97a4ae444fb714b29af9d40603_phenome_abBMD_seeds.csv",
        "params": {
            "num_records": 5000,
            "max_invalid": 10000
        }
    }
}


[32mINFO: [0mStatus is created. A Record generation job has been queued.
[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin generating synthetic records.
[32mINFO: [0mStatus is active. A worker has started!
2021-11-07T16:02:03.506803Z  Loading model to worker
2021-11-07T16:02:04.087536Z  Checking for synthetic smart seeds
2021-11-07T16:02:04.088435Z  Model configuration does not have any seed tasks, ignoring seed data
2021-11-07T16:02:04.088983Z  Loading model
2021-11-07T16:02:06.056611Z  Generating records
{
    "num_records": 5000
}
2021-11-07T16:02:11.063190Z  Generation in progress
{
    "current_valid_count": 0,
    "current_invalid_count": 0,
    "new_valid_count": 0,
    "new_invalid_count": 0,
    "completion_percent": 0.0
}
2021-11-07T16:02:16.081574Z  Generation in progress
{
    "current_valid_count": 396,
    "current_invalid_count": 179,
    "new_valid_count": 396,
    "new_invalid_count": 179,
    "completion_percent": 7.92
}
2021-11-0


Generation using seeds complete!

Using map file: map_abBMD_batch1339.txt

Starting GWAS run!

R version 3.6.1 (2019-07-05) -- "Action of the Toes"
Copyright (C) 2019 The R Foundation for Statistical Computing
Platform: x86_64-conda_cos6-linux-gnu (64-bit)

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> #!/usr/bin/env Rscript
> 
> args = commandArgs(trailingOnly=TRUE)
> 
> base_dir        <- "/home/amy/GitHub/synthetic-data-genomics"
> r_base          <- "research_paper_code"
> experiment_dir  <- "mic

[32m[I 2021-11-07 16:03:06,553][0m Trial 0 finished with value: 0.8166666666666667 and parameters: {'vocab_size': 0, 'rnn_units': 448, 'dropout_rate': 0.6770292042369933, 'gen_temp': 1.184879764203135, 'learning_rate': 0.0011381493913694292, 'reset_states': True}. Best is trial 0 with value: 0.8166666666666667.[0m



Computed F1: 0.8166666666666667
Optimized GWAS F1: 0.81667
Best params:
	dropout_rate: 0.6770292042369933
	gen_temp: 1.184879764203135
	learning_rate: 0.0011381493913694292
	reset_states: True
	rnn_units: 448
	vocab_size: 0


## Start running multiple Optuna trials in parallel
* To do this, replicate this notebook for however many processes you want to have running in parallel.
* In these new notebooks, all the cells will be exactly the same, except you'll remove the above cell that creates the Optuna study and add a new cell (see below) that loads in an existing study and starts new trials

In [None]:
# Continue optimizing.  This is the cell to use for subsequent notebooks running in parallel

study=optuna.load_study(study_name="illumina_batch1_tunings13",storage="sqlite:///tuning.db")

study.optimize(objective,n_trials=30)

## Use this first notebooks to monitor the tuning progress
* We only created one trial in this first notebook so it would complete quickly and leave the notebooks available for monitoring purposes

In [126]:
# How many trials have been sucessfully run so far
len(study.trials)

241

In [125]:
# What is the best trial number
study.best_trial.number

28

In [127]:
# What are the params in the best trial 
study.trials[28].params

{'dropout_rate': 0.24809907352672161,
 'gen_temp': 0.9527334971101686,
 'learning_rate': 0.006615889116029917,
 'reset_states': False,
 'rnn_units': 832,
 'vocab_size': 30000}

In [128]:
# Another way of asking for the best params
study.best_params

{'dropout_rate': 0.24809907352672161,
 'gen_temp': 0.9527334971101686,
 'learning_rate': 0.006615889116029917,
 'reset_states': False,
 'rnn_units': 832,
 'vocab_size': 30000}

In [11]:
# Load the plethora of visualization options available in Optuna

from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice


In [116]:
plot_optimization_history(study)

In [122]:
plot_parallel_coordinate(study)

In [118]:
plot_contour(study)

In [119]:
plot_slice(study)

In [99]:
plot_param_importances(study)