# Create synthetic mouse genome data

Create a synthetic version of the mouse genomes from the original experiment. To run this notebook, you will need an API key from the Gretel console at https://console.gretel.cloud.


In [None]:
%%capture
!pip install -U gretel-client

In [1]:
# Specify your Gretel API key

from getpass import getpass
import pandas as pd
from gretel_client import configure_session, ClientConfig

# Do not truncate cell contents when previewing DataFrames. The fully qualified
# option name is used because pandas resolves abbreviated names by pattern
# matching, which can break if a similarly named option is added later.
pd.set_option('display.max_colwidth', None)

# Prompt for the API key at run time instead of hard-coding it in the notebook.
configure_session(
    ClientConfig(
        api_key=getpass(prompt="Enter Gretel API key"),
        endpoint="https://api.gretel.cloud",
    )
)

                            

Enter Gretel API key········


In [2]:
# Create a project

from gretel_client import create_project

# NOTE(review): `project` is not referenced again in the cells visible here --
# create_model() below creates its own project for every batch it submits.
# Confirm whether this project is still needed.
project = create_project(display_name="synthetic-mouse-genomes")

## Load and preview the training datasets
We're training one model on an example batch with high abBMD associations and one model on an example batch with low abBMD associations.


In [3]:
import logging
import os
import pathlib

training_min_rows = 25000
base_path = pathlib.Path(os.getcwd().replace("/synthetics", ""))
data_path = base_path / 'mice_data_set' / 'data'
batches = []


def replicate_training_set(dataset_path, tmp_path, label, min_rows=training_min_rows):
    """Repeat a training CSV until it has more than `min_rows` rows and save it.

    Args:
        dataset_path: Path of the original training CSV.
        tmp_path: Path the replicated CSV is written to (without the index).
        label: Short name of the dataset, used only in the log message.
        min_rows: Row count the replicated dataset must reach.

    Returns:
        A tuple ``(replicated_df, original_rows)``.

    Raises:
        ValueError: If the original dataset has no rows, since it cannot be
            repeated up to `min_rows`.
    """
    df_original = pd.read_csv(dataset_path)
    original_rows = len(df_original)
    if original_rows == 0:
        raise ValueError(f"Training dataset {dataset_path} contains no rows.")

    # Whole copies of the dataset are stacked; the "+ 1" guarantees the result
    # is at least `min_rows` long.
    df_repeated = pd.concat([df_original] * (min_rows // original_rows + 1))
    df_repeated.to_csv(tmp_path, index=False)

    logging.info(f"Original training dataset length: {original_rows} rows.")
    logging.warning(f"Repeated {original_rows} row training dataset {label} "
                    f"to {len(df_repeated)} rows to help RNN learn structure.")
    return df_repeated, original_rows


# Replicate training set one to have a minimum of 25000 examples

tmp_path = '/tmp/tmp_geno_highassoc_train.csv'
dataset_path = data_path / 'geno_abBMD_highassoc_train.csv'

df_highassoc, dataset_rows = replicate_training_set(dataset_path, tmp_path, "one")
batches.append(str(tmp_path))

print("Example rows from high association training batch:\n")
print(df_highassoc[:5])

# Replicate training set two to have a minimum of 25000 examples

tmp_path = '/tmp/tmp_geno_lowassoc_train.csv'
dataset_path = data_path / 'geno_abBMD_lowassoc_train.csv'

# NOTE: `dataset_rows` is left holding the row count of this second (low
# association) dataset; the model configuration cell reads it for `num_records`.
df_lowassoc, dataset_rows = replicate_training_set(dataset_path, tmp_path, "two")
batches.append(str(tmp_path))

print("\n\nExample rows from low association training batch:\n")
print(df_lowassoc[:5])




Example rows from high association training batch:

   rs27052855  rs257710525  rs6258876  rs49153109  rs29395706  rs49725879  \
0           0            0          0           1           0           1   
1           1            1          1           1           1           1   
2           2            1          1           2           2           2   
3           1            1          1           1           1           1   
4           2            2          2           2           2           2   

   rs6284806  rs27052698  rs255791755  rs49072129  rs29467625  rs27037903  \
0          1           0            0           0           0           0   
1          1           1            0           0           0           0   
2          2           1            0           1           1           1   
3          1           1            1           0           1           1   
4          2           0            1           2           2           2   

   rs50536616  rs24074

## Configure model hyperparameters
Load the default configuration template. This template will work well for most datasets. View other templates at https://github.com/gretelai/gretel-blueprints/tree/main/config_templates/gretel/synthetics

In [4]:
import json
from smart_open import open
import yaml

# Fetch the default synthetics configuration template and parse the YAML.
template_url = "https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/config_templates/gretel/synthetics/default.yml"
with open(template_url, 'r') as template_file:
    config = yaml.safe_load(template_file)

# Columns listed in the seed task of the model configuration.
fields = ['abBMD', 'SW16']
task = {'type': 'seed', 'attrs': {'fields': fields}}

# Optimize parameters for complex dataset
synthetics_config = config['models'][0]['synthetics']
synthetics_config['task'] = task
synthetics_config['params'].update({
    'epochs': 150,
    'vocab_size': 38,
    'rnn_units': 768,
    'reset_states': False,
    'learning_rate': 0.0016,
    'dropout_rate': 0.5645,
    'gen_temp': 0.9173,
})
synthetics_config['generate']['num_records'] = dataset_rows
synthetics_config['privacy_filters']['similarity'] = None

# Show the final configuration that will be submitted with each model.
print(json.dumps(config, indent=2))

{
  "schema_version": "1.0",
  "models": [
    {
      "synthetics": {
        "data_source": "__tmp__",
        "params": {
          "epochs": 150,
          "batch_size": 64,
          "vocab_size": 38,
          "reset_states": false,
          "learning_rate": 0.0016,
          "rnn_units": 768,
          "dropout_rate": 0.5645,
          "overwrite": true,
          "early_stopping": true,
          "gen_temp": 0.9173,
          "predict_batch_size": 64,
          "validation_split": false,
          "dp": false,
          "dp_noise_multiplier": 0.001,
          "dp_l2_norm_clip": 5.0,
          "dp_microbatches": 1
        },
        "validators": {
          "in_set_count": 10,
          "pattern_count": 10
        },
        "generate": {
          "num_records": 862,
          "max_invalid": null
        },
        "privacy_filters": {
          "outliers": "medium",
          "similarity": null
        },
        "task": {
          "type": "seed",
          "attrs": {
     

## Train the synthetic models
In this step, we will task the worker running in the Gretel cloud, or locally, to train a synthetic model on each source dataset.

In [5]:
# Define a function to submit a new model for a specific genome batch dataset

def create_model(batch_num):
    """Create a new project and submit a model training job for one batch.

    Args:
        batch_num: Index into the notebook-level ``batches`` list selecting the
            training CSV to upload.

    Returns:
        The submitted model object created by ``project.create_model_obj``.
    """
    # Imported locally so the function does not rely on a later cell having
    # imported `time` before it is called.
    import time

    # Every submission gets its own project, named with the current epoch second.
    project_name = "Training genomes" + str(int(time.time()))
    project = create_project(display_name=project_name)

    model = project.create_model_obj(model_config=config)
    model.data_source = batches[batch_num]
    model.submit(upload_data_source=True)
    return model

In [6]:
# Submit all the genome batches to train in parallel; poll for completion

from gretel_client.helpers import poll
from gretel_client import create_project
import time

# Statuses that mean a job has not finished yet. "created" is included so the
# polling loop does not stop early for a job that no worker has picked up yet.
IN_PROGRESS_STATUSES = ("created", "pending", "active")

# Upper bound on resubmissions per batch. Each resubmission creates a new
# project and uploads the data again, so a job that fails every time must not
# be retried forever.
MAX_RESUBMITS = 3


def _refresh_status(model):
    """Poll the job endpoint for `model` and return its current status string."""
    model._poll_job_endpoint()
    return model.__dict__['_data']['model']['status']


# Create a model for each batch
models = [create_model(i) for i in range(len(batches))]
resubmits = [0] * len(models)

# Poll for completion. Resubmit errors.
training = True
while training:
    time.sleep(60)
    training = False
    print()
    for i, model in enumerate(models):
        status = _refresh_status(model)
        print("Batch " + str(i) + " has status: " + status)
        if status in IN_PROGRESS_STATUSES:
            training = True
        elif status == "error" and resubmits[i] < MAX_RESUBMITS:
            resubmits[i] += 1
            models[i] = create_model(i)
            training = True

# Now that models are complete, get each batch's Synthetic Quality Score (SQS)
print()
for batch, model in enumerate(models):
    status = _refresh_status(model)
    if status == "error":
        print("Batch " + str(batch) + " ended with error")
    else:
        report = model.peek_report()
        sqs = report['synthetic_data_quality_score']['score']
        # Map the numeric score onto the label shown next to it.
        if sqs >= 80:
            label = "Excellent"
        elif sqs >= 60:
            label = "Good"
        else:
            label = "Moderate"
        print("Batch " + str(batch) + " completes with SQS: " + label + " (" + str(sqs) + ")")


Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: active
Batch 1 has status: active

Batch 0 has status: completed
Batch 1 has status: completed

Batch 0 completes with SQS: Excellent (89)
Batch 1 completes with SQS: Excellent (80)


## View the synthetic data quality reports

In [7]:
# Generate report that shows the statistical performance between the training and synthetic data

from smart_open import open
from IPython.display import display, HTML


# Change batch_num to any value between 0 and 1 to view performance report for other batches
batch_num = 0

# Read the report inside a `with` block so the remote stream is closed once the
# HTML has been loaded, instead of leaving the handle open.
with open(models[batch_num].get_artifact_link("report")) as report_stream:
    report_html = report_stream.read()

display(HTML(data=report_html, metadata=dict(isolated=True)))

0,1,2,3,4,5
Synthetic Data Use Cases,Excellent,Good,Moderate,Poor,Very Poor
Significant tuning required to improve model,,,,,
Improve your model using our tips and advice,,,,,
Demo environments or mock data,,,,,
Pre-production testing environments,,,,,
Balance or augment machine learning data sources,,,,,
Machine learning or statistical analysis,,,,,

0,1,2,3,4
Data Sharing Use Case,Excellent,Very Good,Good,Normal
"Internally, within the same team",,,,
"Internally, across different teams",,,,
"Externally, with trusted partners",,,,
"Externally, public availability",,,,

Unnamed: 0,Training Data,Synthetic Data
Row Count,25860,862
Column Count,19,19
Training Lines Duplicated,--,303

Default Privacy Protections,Advanced Protections

Field,Unique,Missing,Ave. Length,Type,Distribution Stability
rs255791755,3,0,1.0,Categorical,Excellent
rs27052855,3,0,1.0,Categorical,Excellent
rs29464487,3,0,1.0,Categorical,Excellent
rs6258876,3,0,1.0,Categorical,Excellent
rs29395706,3,0,1.0,Categorical,Excellent
rs29467625,3,0,1.0,Categorical,Excellent
rs49725879,3,0,1.0,Categorical,Excellent
rs27037903,3,0,1.0,Categorical,Excellent
rs257710525,3,0,1.0,Categorical,Excellent
rs49153109,3,0,1.0,Categorical,Excellent


## Generate synthetic genome data using seed values from the synthetic phenome data
This ensures that the new synthetic genome data aligns one-to-one with the synthetic phenome data.

In [7]:
# Load the phenome seed file; one genome record is requested per seed row.
seedfile = str(data_path / 'phenome_abBMD_seeds.csv')
seed_df = pd.read_csv(seedfile)
generation_params = {"num_records": len(seed_df)}

# Submit a cloud generation job against the first model (trained on the high
# association batch) and poll it.
rh = models[0].create_record_handler_obj(data_source=seedfile, params=generation_params)
rh.submit_cloud()
poll(rh)

# Read the gzip-compressed generated records and display them.
synthetic_data_url = rh.get_artifact_link("data")
synthetic_genomes_highassoc = pd.read_csv(synthetic_data_url, compression='gzip')
synthetic_genomes_highassoc

[32mINFO: [0mStarting poller


{
    "uid": "6171e7dcad5bc28095b1cd94",
    "model_name": null,
    "runner_mode": "cloud",
    "user_id": "5f45aedbbff62139017abfeb",
    "project_id": "6171dc94a93661040a70cad8",
    "status_history": {
        "created": "2021-10-21T22:21:15.952000Z"
    },
    "last_modified": "2021-10-21T22:21:16.044000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:5188b73e1fc582fde1b3d77cac52d03a5e26a7bcc59b68e52fd04f4a8501b7d0",
    "model_id": "6171dc9d5979f4038dee2b95",
    "action": "generate",
    "config": {
        "data_source": "gretel_6850e2f4292042a98fcf11f8730570e0_phenome_abBMD_seeds.csv",
        "params": {
            "num_records": 5000,
            "max_invalid": 10000
        }
    }
}


[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin generating synthetic records.
[32mINFO: [0mStatus is active. A worker has started!
2021-10-21T22:21:31.296344Z  Loading model to worker
2021-10-21T22:21:31.297170Z  Attempting to load model from Gretel Cloud
2021-10-21T22:21:33.805132Z  Checking for synthetic smart seeds
2021-10-21T22:21:34.103789Z  Loaded 5000 smart seeds for generation
2021-10-21T22:21:34.104596Z  Loading model
2021-10-21T22:21:36.405866Z  Generating records
{
    "num_records": 5000
}
2021-10-21T22:21:41.419513Z  Generation in progress
{
    "current_valid_count": 43,
    "current_invalid_count": 3,
    "new_valid_count": 43,
    "new_invalid_count": 3,
    "completion_percent": 0.86
}
2021-10-21T22:21:46.427130Z  Generation in progress
{
    "current_valid_count": 98,
    "current_invalid_count": 12,
    "new_valid_count": 55,
    "new_invalid_count": 9,
    "completion_percent": 1.96
}
2021-10-21T22:21:51.433321Z  Generation in p

2021-10-21T22:24:46.628079Z  Generation in progress
{
    "current_valid_count": 2220,
    "current_invalid_count": 203,
    "new_valid_count": 61,
    "new_invalid_count": 4,
    "completion_percent": 44.4
}
2021-10-21T22:24:51.633988Z  Generation in progress
{
    "current_valid_count": 2283,
    "current_invalid_count": 206,
    "new_valid_count": 63,
    "new_invalid_count": 3,
    "completion_percent": 45.66
}
2021-10-21T22:24:56.641351Z  Generation in progress
{
    "current_valid_count": 2346,
    "current_invalid_count": 209,
    "new_valid_count": 63,
    "new_invalid_count": 3,
    "completion_percent": 46.92
}
2021-10-21T22:25:01.646512Z  Generation in progress
{
    "current_valid_count": 2411,
    "current_invalid_count": 211,
    "new_valid_count": 65,
    "new_invalid_count": 2,
    "completion_percent": 48.22
}
2021-10-21T22:25:06.652197Z  Generation in progress
{
    "current_valid_count": 2472,
    "current_invalid_count": 215,
    "new_valid_count": 61,
    "new_inva

2021-10-21T22:28:06.851175Z  Generation in progress
{
    "current_valid_count": 4619,
    "current_invalid_count": 393,
    "new_valid_count": 61,
    "new_invalid_count": 2,
    "completion_percent": 92.38
}
2021-10-21T22:28:11.857244Z  Generation in progress
{
    "current_valid_count": 4675,
    "current_invalid_count": 401,
    "new_valid_count": 56,
    "new_invalid_count": 8,
    "completion_percent": 93.5
}
2021-10-21T22:28:16.863191Z  Generation in progress
{
    "current_valid_count": 4730,
    "current_invalid_count": 408,
    "new_valid_count": 55,
    "new_invalid_count": 7,
    "completion_percent": 94.6
}
2021-10-21T22:28:21.869299Z  Generation in progress
{
    "current_valid_count": 4789,
    "current_invalid_count": 413,
    "new_valid_count": 59,
    "new_invalid_count": 5,
    "completion_percent": 95.78
}
2021-10-21T22:28:26.875863Z  Generation in progress
{
    "current_valid_count": 4848,
    "current_invalid_count": 416,
    "new_valid_count": 59,
    "new_inval

Unnamed: 0,rs27052855,rs257710525,rs6258876,rs49153109,rs29395706,rs49725879,rs6284806,rs27052698,rs255791755,rs49072129,rs29467625,rs27037903,rs50536616,rs240744127,rs27037855,rs27037853,rs29464487,abBMD,SW16
0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0.0,0.0
1,0,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,0.0,0.0
2,0,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1,0.0,0.0
3,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0.0,0.0
4,1,1,2,2,2,2,1,1,0,1,1,1,2,2,2,2,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2,1,1,2,2,2,2,2,0,1,2,1,2,2,2,2,1,0.0,0.0
4996,2,2,2,2,2,2,2,1,0,1,2,1,2,2,2,2,1,0.0,0.0
4997,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0.0,0.0
4998,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,0.0,0.0


In [8]:
# Drop the phenome information from the genome synth data and add back in the fields "id" and "discard"

# `errors='ignore'` together with dropping any existing "id"/"discard" columns
# makes this cell safe to re-run: a second run rebuilds the same frame instead
# of raising KeyError because the phenome columns are already gone.
synthetic_genomes_highassoc = synthetic_genomes_highassoc.drop(
    columns=['abBMD', 'SW16', 'id', 'discard'], errors='ignore'
)

# "id" is a 0-based row counter and "discard" is "no" for every record; both
# are placed ahead of the genome columns.
synthetic_genomes_highassoc.insert(0, "id", range(len(synthetic_genomes_highassoc)))
synthetic_genomes_highassoc.insert(1, "discard", "no")

In [9]:
# Preview the reformatted high association genomes.
synthetic_genomes_highassoc

Unnamed: 0,id,discard,rs27052855,rs257710525,rs6258876,rs49153109,rs29395706,rs49725879,rs6284806,rs27052698,rs255791755,rs49072129,rs29467625,rs27037903,rs50536616,rs240744127,rs27037855,rs27037853,rs29464487
0,0,no,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1
1,1,no,0,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0
2,2,no,0,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1
3,3,no,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1
4,4,no,1,1,2,2,2,2,1,1,0,1,1,1,2,2,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995,no,2,1,1,2,2,2,2,2,0,1,2,1,2,2,2,2,1
4996,4996,no,2,2,2,2,2,2,2,1,0,1,2,1,2,2,2,2,1
4997,4997,no,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1
4998,4998,no,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1


In [10]:
# Number of seed rows read from the phenome seed file.
len(seed_df)

5000

In [11]:
# Repeat the seeded generation for the low association batch: the same phenome
# seeds are submitted to the second model.
generation_params = {"num_records": len(seed_df)}
rh = models[1].create_record_handler_obj(data_source=seedfile, params=generation_params)
rh.submit_cloud()
poll(rh)

# Read the gzip-compressed generated records and display them.
synthetic_data_url = rh.get_artifact_link("data")
synthetic_genomes_lowassoc = pd.read_csv(synthetic_data_url, compression='gzip')
synthetic_genomes_lowassoc

[32mINFO: [0mStarting poller


{
    "uid": "6171ea155d986e5d8c7ac565",
    "model_name": null,
    "runner_mode": "cloud",
    "user_id": "5f45aedbbff62139017abfeb",
    "project_id": "6171dc9ea93661040a70cad9",
    "status_history": {
        "created": "2021-10-21T22:30:44.044000Z"
    },
    "last_modified": "2021-10-21T22:30:45.273000Z",
    "status": "created",
    "last_active_hb": null,
    "duration_minutes": null,
    "error_msg": null,
    "error_id": null,
    "traceback": null,
    "container_image": "074762682575.dkr.ecr.us-west-2.amazonaws.com/gretelai/synthetics@sha256:5188b73e1fc582fde1b3d77cac52d03a5e26a7bcc59b68e52fd04f4a8501b7d0",
    "model_id": "6171dc9f5979f4038dee2b96",
    "action": "generate",
    "config": {
        "data_source": "gretel_b8038b59621d435b844dd5b4105e5404_phenome_abBMD_seeds.csv",
        "params": {
            "num_records": 5000,
            "max_invalid": 10000
        }
    }
}


[32mINFO: [0mStatus is pending. A Gretel Cloud worker is being allocated to begin generating synthetic records.
[32mINFO: [0mStatus is active. A worker has started!
2021-10-21T22:30:58.543693Z  Loading model to worker
2021-10-21T22:30:58.544510Z  Attempting to load model from Gretel Cloud
2021-10-21T22:31:00.606417Z  Checking for synthetic smart seeds
2021-10-21T22:31:00.728096Z  Loaded 5000 smart seeds for generation
2021-10-21T22:31:00.728893Z  Loading model
2021-10-21T22:31:03.068556Z  Generating records
{
    "num_records": 5000
}
2021-10-21T22:31:08.084059Z  Generation in progress
{
    "current_valid_count": 38,
    "current_invalid_count": 9,
    "new_valid_count": 38,
    "new_invalid_count": 9,
    "completion_percent": 0.76
}
2021-10-21T22:31:13.090289Z  Generation in progress
{
    "current_valid_count": 99,
    "current_invalid_count": 13,
    "new_valid_count": 61,
    "new_invalid_count": 4,
    "completion_percent": 1.98
}
2021-10-21T22:31:18.095568Z  Generation in p

2021-10-21T22:34:12.294026Z  Generation in progress
{
    "current_valid_count": 2177,
    "current_invalid_count": 294,
    "new_valid_count": 55,
    "new_invalid_count": 9,
    "completion_percent": 43.54
}
2021-10-21T22:34:17.300534Z  Generation in progress
{
    "current_valid_count": 2235,
    "current_invalid_count": 301,
    "new_valid_count": 58,
    "new_invalid_count": 7,
    "completion_percent": 44.7
}
2021-10-21T22:34:22.306600Z  Generation in progress
{
    "current_valid_count": 2293,
    "current_invalid_count": 311,
    "new_valid_count": 58,
    "new_invalid_count": 10,
    "completion_percent": 45.86
}
2021-10-21T22:34:27.312392Z  Generation in progress
{
    "current_valid_count": 2354,
    "current_invalid_count": 317,
    "new_valid_count": 61,
    "new_invalid_count": 6,
    "completion_percent": 47.08
}
2021-10-21T22:34:32.318176Z  Generation in progress
{
    "current_valid_count": 2410,
    "current_invalid_count": 327,
    "new_valid_count": 56,
    "new_inv

2021-10-21T22:37:32.527273Z  Generation in progress
{
    "current_valid_count": 4530,
    "current_invalid_count": 591,
    "new_valid_count": 56,
    "new_invalid_count": 11,
    "completion_percent": 90.6
}
2021-10-21T22:37:37.533136Z  Generation in progress
{
    "current_valid_count": 4592,
    "current_invalid_count": 595,
    "new_valid_count": 62,
    "new_invalid_count": 4,
    "completion_percent": 91.84
}
2021-10-21T22:37:42.539378Z  Generation in progress
{
    "current_valid_count": 4654,
    "current_invalid_count": 601,
    "new_valid_count": 62,
    "new_invalid_count": 6,
    "completion_percent": 93.08
}
2021-10-21T22:37:47.545479Z  Generation in progress
{
    "current_valid_count": 4716,
    "current_invalid_count": 606,
    "new_valid_count": 62,
    "new_invalid_count": 5,
    "completion_percent": 94.32
}
2021-10-21T22:37:52.552284Z  Generation in progress
{
    "current_valid_count": 4770,
    "current_invalid_count": 618,
    "new_valid_count": 54,
    "new_inv

Unnamed: 0,rs29873068,rs257562009,cfw-13-24187180,rs235329625,rs263819763,rs29880124,rs248411793,rs259769137,rs29885281,rs256615171,rs230595969,rs225065578,rs29880636,rs216070209,rs232346591,rs249497227,rs29883093,abBMD,SW16
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
4996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
4997,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
4998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0


In [12]:
# Drop the phenome information from the genome synth data and add back in the fields "id" and "discard"

# `errors='ignore'` together with dropping any existing "id"/"discard" columns
# makes this cell safe to re-run: a second run rebuilds the same frame instead
# of raising KeyError because the phenome columns are already gone.
synthetic_genomes_lowassoc = synthetic_genomes_lowassoc.drop(
    columns=['abBMD', 'SW16', 'id', 'discard'], errors='ignore'
)

# "id" is a 0-based row counter and "discard" is "no" for every record; both
# are placed ahead of the genome columns.
synthetic_genomes_lowassoc.insert(0, "id", range(len(synthetic_genomes_lowassoc)))
synthetic_genomes_lowassoc.insert(1, "discard", "no")

In [13]:
# Preview the reformatted low association genomes.
synthetic_genomes_lowassoc

Unnamed: 0,id,discard,rs29873068,rs257562009,cfw-13-24187180,rs235329625,rs263819763,rs29880124,rs248411793,rs259769137,rs29885281,rs256615171,rs230595969,rs225065578,rs29880636,rs216070209,rs232346591,rs249497227,rs29883093
0,0,no,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,no,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,no,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,no,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,no,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995,no,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4996,4996,no,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4997,4997,no,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4998,4998,no,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Save the synthetically generated genomes

In [14]:
# Write the high association genomes to the data directory as a
# space-separated text file, without the DataFrame index.
synthetic_genomes_highassoc.to_csv(data_path / 'synthetic_genomes_highassoc.txt', index=False, sep=' ')

In [15]:
# Write the low association genomes to the data directory as a
# space-separated text file, without the DataFrame index.
synthetic_genomes_lowassoc.to_csv(data_path / 'synthetic_genomes_lowassoc.txt', index=False, sep=' ')