# Build the mouse genome training dataset
This notebook creates mouse genome training datasets by batching the genome data by position on the chromosome. You can choose how many batches to build training datasets for, and which characteristics those batches should have.
Your choices are written to a file that the next notebook, 04_create_synthetic_mouse_genomes, reads from.

In [1]:
import os
import pathlib
import pandas as pd

# Project root: the current working directory with every path component named
# exactly "synthetics" removed. Filtering whole components (instead of
# substring-replacing "/synthetics") leaves names such as "synthetics-old"
# intact and does not depend on "/" being the path separator.
base_path = pathlib.Path(*(part for part in pathlib.Path.cwd().parts if part != "synthetics"))
data_path = base_path / 'mice_data_set' / 'data'        # input data and generated training/map files
experiment_path = base_path / 'mice_data_set' / 'out'   # GWAS result files are read from here

In [2]:
# Read in the geno data and remove the discards (slow)

import pandas as pd


genofile = data_path.joinpath("geno.txt")
geno = pd.read_csv(genofile, sep=' ')
# Keep only the rows whose "discard" flag is "no"
geno = geno.loc[geno["discard"].eq("no")]
geno.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,discard,cfw-1-3082859,cfw-1-3207478,cfw-1-3284999,cfw-1-4056451,rs241840178,cfw-1-4592184,rs214108183,rs31954814,...,rs239202862,cfw-19-60773695,rs212272420,rs51223003,rs30736750,rs30654044,rs30990073,rs50978457,rs51755773,cfw-19-61107432
0,26305,no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0
1,26306,no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.929,1.98,1.999,0.0,0.0,2.0,1.964,1.689,1.781,0.254
2,26307,no,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,1.933,1.603,1.993,0.0,0.0,2.0,1.492,1.793,1.538,0.526
3,26308,no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.776,...,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,1.0,0.0
4,26309,no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,2.0,0.361


In [3]:
# Load the muscle and bone phenome table, which contains the abBMD phenome and its covariate

phenofile = data_path.joinpath("pheno_batch0_withID.csv")
pheno = pd.read_csv(phenofile)
pheno.head()

Unnamed: 0,TA,SW16,tibia,EDL,plantaris,gastroc,SW6,sacweight,BMD,abBMD,id
0,74.6,0.0,19.05,16.9,23.2,194.8,0.0,46.6,1.92,0.0,26305
1,62.3,0.0,18.06,13.2,18.8,154.6,0.0,35.7,1.88,0.0,26306
2,54.1,0.0,18.14,11.2,17.6,143.6,0.0,34.1,1.89,0.0,26307
3,56.5,0.0,18.18,12.9,17.7,148.6,0.0,41.8,1.95,0.0,26308
4,64.0,0.0,18.35,14.1,20.6,157.4,0.0,39.5,1.92,0.0,26309


In [4]:
# Load the original GWAS linear-model results for abBMD; each row carries a SNP,
# its chromosome ("chr"), its position ("pos") and its p-value ("p")

gwasfile = experiment_path.joinpath("lm_abBMD_1_79646.csv")
gwas_scores = pd.read_csv(gwasfile)
gwas_scores.head()

Unnamed: 0.1,Unnamed: 0,snp,chr,pos,p
0,1,rs29477109,11,95292217,5.052317e-14
1,2,rs27071351,11,96114911,7.074181e-14
2,3,rs27024162,11,96918116,7.170582e-14
3,4,rs49423067,11,96918212,7.198661e-14
4,5,rs29470802,11,95263588,8.049849e-14


In [5]:
# Order the SNPs by chromosome, then by position within the chromosome.
# reset_index() gives the sorted rows a fresh 0..n-1 index and keeps the
# previous index as an "index" column.

sort_keys = ['chr', 'pos']
gwas_scores_sorted = gwas_scores.sort_values(by=sort_keys).reset_index()
gwas_scores_sorted.head()

Unnamed: 0.1,index,Unnamed: 0,snp,chr,pos,p
0,73583,73584,cfw-1-3207478,1,3207478,0.921913
1,40919,40920,cfw-1-4592184,1,4592184,0.496735
2,29303,29304,rs31954814,1,5151352,0.353184
3,44285,44286,rs31947195,1,5240999,0.540256
4,40335,40336,rs30660852,1,5241015,0.489395


In [6]:
# Group the SNPs into batches of at most 17 (saving room to use abBMD and SW16 as seeds).
# A new batch is started whenever the current one is full or the chromosome changes,
# so a batch never spans two chromosomes.

interesting_threshold = 5e-8   # a SNP is counted as "interesting" when its p-value is <= this

batches = {}
batch_num = -1
max_snps_per_batch = 17
snp_cnt = 0
last_chromo = -1
batch_psum = {}
batch_avg_pvalue = {}
all_avg_pvalues = []
all_int_cnt = []
all_not_int_cnt = []
all_batch_nums = []

# Walk the four needed columns in lock-step instead of materialising a full row
# Series four times per SNP with .loc[i][...]; the values visited and their
# order are the same, without the per-row overhead.
snp_rows = zip(
    gwas_scores_sorted['p'],
    gwas_scores_sorted['chr'],
    gwas_scores_sorted['pos'],
    gwas_scores_sorted['snp'],
)
for pscore, chromo, pos, snp in snp_rows:
    # batch_num < 0 guarantees the very first SNP opens a batch regardless of
    # how chromosomes happen to be labelled.
    if batch_num < 0 or snp_cnt == max_snps_per_batch or chromo != last_chromo:
        batch_num += 1
        batches[batch_num] = {
            'chr_pos': [],
            'pvalues': [],
            'snps': [],
            'snp_cnt': 0,
            'interesting_cnt': 0,
            'not_interesting_cnt': 0,
        }
        batch_psum[batch_num] = 0
        snp_cnt = 0
    current = batches[batch_num]
    current['chr_pos'].append(str(chromo) + "_" + str(pos))
    current['pvalues'].append(pscore)
    current['snps'].append(snp)
    current['snp_cnt'] += 1
    if pscore <= interesting_threshold:
        current['interesting_cnt'] += 1
    else:
        current['not_interesting_cnt'] += 1
    batch_psum[batch_num] += pscore
    last_chromo = chromo
    snp_cnt += 1

# Compute the average pvalue per batch
for next_batch, batch_info in batches.items():
    batch_avg_pvalue[next_batch] = batch_psum[next_batch] / batch_info['snp_cnt']
    all_avg_pvalues.append(batch_avg_pvalue[next_batch])
    all_int_cnt.append(batch_info['interesting_cnt'])
    all_not_int_cnt.append(batch_info['not_interesting_cnt'])
    # Record the batch key itself (not an enumeration position) so the "batch"
    # column is always a valid key into `batches`.
    all_batch_nums.append(next_batch)

pvalue_df = pd.DataFrame({"batch": all_batch_nums, "avg_pvalue": all_avg_pvalues, "interesting_cnt": all_int_cnt, "not_interesting_cnt": all_not_int_cnt})
     

In [7]:
# Create function to build a training set from a batch

def build_training(batch, training_min_rows=25000):
    """Write the training CSV and the matching SNP map file for one batch.

    Relies on notebook-level globals: ``batches`` (batch number -> dict with a
    "snps" list), ``geno`` and ``pheno`` (DataFrames that both have an "id"
    column) and ``data_path``.

    Parameters
    ----------
    batch
        Key into ``batches`` identifying the batch to export.
    training_min_rows : int, default 25000
        The joined rows are repeated as a whole until the training set has at
        least this many rows.

    Side effects
    ------------
    Creates the output directories if needed and writes
    ``data_path/genome_training_data/geno_abBMD_batch<batch>_train.csv`` and
    ``data_path/genome_map_data/map_abBMD_batch<batch>.txt``.
    Reads ``data_path/map.txt``.

    Raises
    ------
    ValueError
        If no genotype row matches a phenotype row on "id" (nothing to train on).
    """
    batch_snps = list(batches[batch]["snps"])

    # Gather the SNPs of this batch plus the "id" column used for the join
    geno_grp = geno.filter(batch_snps + ["id"])

    # Cast float values to integers (astype truncates the fractional part).
    # A single astype call returns a new frame, so nothing is assigned into a
    # filtered view of `geno`.
    float_cols = geno_grp.select_dtypes(include=['float64']).columns
    geno_grp = geno_grp.astype({col: 'int64' for col in float_cols})

    # Add in the phenome information to the genome training set
    genome_phenome = geno_grp.join(pheno.set_index('id'), on="id", how="inner")
    genome_train = genome_phenome.filter(batch_snps + ["abBMD", "SW16"])

    # Replicate training set to have a minimum of `training_min_rows` examples
    dataset_rows = len(genome_train)
    if dataset_rows == 0:
        raise ValueError(
            "batch " + str(batch) + ": no rows left after joining geno and pheno on 'id'"
        )
    repeats = max(1, training_min_rows // dataset_rows + 1)
    genome_train = pd.concat([genome_train] * repeats)

    # Save the training file (create the target directory on first use)
    train_dir = data_path / "genome_training_data"
    train_dir.mkdir(parents=True, exist_ok=True)
    filename = "geno_abBMD_batch" + str(batch) + "_train.csv"
    genofile = train_dir / filename
    genome_train.to_csv(genofile, index=False, header=True)

    # Now create a version of map.txt with just the SNPs in this batch
    mapfile = data_path / "map.txt"
    mapdata = pd.read_csv(mapfile, sep=' ')
    mapdata_use = mapdata[mapdata["id"].isin(batch_snps)]
    map_dir = data_path / "genome_map_data"
    map_dir.mkdir(parents=True, exist_ok=True)
    filename = "map_abBMD_batch" + str(batch) + ".txt"
    mapfile_new = map_dir / filename
    mapdata_use.to_csv(mapfile_new, sep=' ', header=True, index=False)
    

In [8]:
# Number of batches containing at least one interesting pvalue
pvalue_df.query("interesting_cnt > 0").shape[0]

40

In [27]:
# Create training sets for some or all of the batches with interesting pvalues

interesting_batches_to_use = 40
interesting_sample_seed = 42   # fixed seed so the same batches are drawn on every run
all_batches_used = []

interesting_pool = pvalue_df[pvalue_df["interesting_cnt"] > 0]["batch"]
# Cap the request at the pool size: sample() without replacement raises when
# asked for more items than exist.
batches_to_use = list(
    interesting_pool.sample(
        n=min(interesting_batches_to_use, len(interesting_pool)),
        random_state=interesting_sample_seed,
    )
)
for batch in batches_to_use:
    # Actually build the training set, so every batch recorded in
    # all_batches_used has its files on disk after a fresh run.
    build_training(batch)
    all_batches_used.append(batch)

In [10]:
# Number of batches without a single interesting pvalue
pvalue_df.query("interesting_cnt == 0").shape[0]

4654

In [28]:
# Create training sets for some or all of the batches with no interesting pvalues

non_interesting_batches_to_use = 500
non_interesting_sample_seed = 42   # fixed seed so the same batches are drawn on every run

non_interesting_pool = pvalue_df[pvalue_df["interesting_cnt"] == 0]["batch"]
# Cap the request at the pool size: sample() without replacement raises when
# asked for more items than exist.
batches_to_use = list(
    non_interesting_pool.sample(
        n=min(non_interesting_batches_to_use, len(non_interesting_pool)),
        random_state=non_interesting_sample_seed,
    )
)
for batch in batches_to_use:
    # Actually build the training set, so every batch recorded in
    # all_batches_used has its files on disk after a fresh run.
    build_training(batch)
    all_batches_used.append(batch)

In [29]:
# Persist the batch numbers collected in all_batches_used as a one-column CSV

filename = data_path / "batch_training_list.csv"
file_df = pd.DataFrame({"batch": all_batches_used})
file_df.to_csv(filename, header=True, index=False)

In [31]:
# Additionally, build a training set for every batch that is not in all_batches_used

all_batches = list(pvalue_df["batch"])
already_listed = set(all_batches_used)   # set lookup instead of scanning the list per batch
for batch in all_batches:
    if batch in already_listed:
        continue
    build_training(batch)


In [32]:
# Total number of batches (one entry per row of pvalue_df)
len(all_batches)

4694

In [33]:
# Persist the complete list of batch numbers as a one-column CSV

filename = data_path / "batch_training_list_all.csv"
file_df = pd.DataFrame({"batch": all_batches})
file_df.to_csv(filename, header=True, index=False)