# Build the mouse genome training dataset
This notebook builds a mouse genome training dataset by grouping the genome data (SNPs) into batches ordered by position on the chromosome. It then builds a training set for the abBMD phenotype from one example batch of SNPs.

In [None]:
import os
import pathlib
import pandas as pd

# Resolve the project root from the working directory with "/synthetics" removed,
# then locate the input data and experiment output folders under mice_data_set.
cwd_without_synthetics = os.getcwd().replace("/synthetics", "")
base_path = pathlib.Path(cwd_without_synthetics)
mice_root = base_path / 'mice_data_set'
data_path = mice_root / 'data'
experiment_path = mice_root / 'out'

In [None]:
# Load the space-separated genotype table (slow) and keep only the rows
# whose "discard" column equals "no".

import pandas as pd


genofile = data_path.joinpath("geno.txt")
geno = pd.read_csv(genofile, sep=' ').loc[lambda frame: frame["discard"] == "no"]
geno.head()

In [None]:
# Load the muscle and bone phenome table, which contains the abBMD phenome and its covariate

phenofile = data_path.joinpath("pheno_batch0_withID.csv")
pheno = pd.read_csv(filepath_or_buffer=phenofile)
pheno.head()

In [None]:
# Load the original GWAS linear model results for abBMD, which also carry
# the chromosome position information

gwasfile = experiment_path.joinpath("lm_abBMD_1_79646.csv")
gwas_scores = pd.read_csv(filepath_or_buffer=gwasfile)
gwas_scores.head()

In [None]:
# Order the GWAS rows by chromosome, then by position within each chromosome.
# reset_index() renumbers the rows from 0 and keeps the previous row labels as a column.

position_order = ['chr', 'pos']
gwas_scores_sorted = gwas_scores.sort_values(by=position_order)
gwas_scores_sorted = gwas_scores_sorted.reset_index()
gwas_scores_sorted.head()

In [None]:
# Group the SNPs into batches of 17 (saving room to use abBMD and SW16 as seeds)
#
# Walks the position-sorted GWAS table once and opens a new batch whenever the
# current batch is full or the chromosome changes, so a batch never spans two
# chromosomes. Results:
#   batches[n]    -> {'chr_pos': [...], 'pvalues': [...], 'snps': [...]}
#   batch_psum[n] -> sum of the p-values of the SNPs in batch n

batches = {}
batch_num = -1
max_snps_per_batch = 17
snp_cnt = 0
last_chromo = -1
batch_psum = {}

# Iterate the four needed columns in lockstep rather than doing four
# `.loc[i][...]` lookups per SNP: every `.loc[i]` builds a full row Series,
# which dominates the run time on a large table. This also removes the
# dependency on the index being exactly 0..n-1.
snp_rows = zip(
    gwas_scores_sorted['p'],
    gwas_scores_sorted['chr'],
    gwas_scores_sorted['pos'],
    gwas_scores_sorted['snp'],
)

for pscore, chromo, pos, snp in snp_rows:
    # Start a fresh batch when the current one is full or a new chromosome begins
    if (snp_cnt == max_snps_per_batch) or (chromo != last_chromo):
        batch_num += 1
        batches[batch_num] = {'chr_pos': [], 'pvalues': [], 'snps': []}
        batch_psum[batch_num] = 0
        snp_cnt = 0
    current_batch = batches[batch_num]
    current_batch['chr_pos'].append(str(chromo) + "_" + str(pos))
    current_batch['pvalues'].append(pscore)
    current_batch['snps'].append(snp)
    batch_psum[batch_num] += pscore
    last_chromo = chromo
    snp_cnt += 1
     

In [None]:
# Which batch has the strongest associations with abBMD?
# Picks the batch with the smallest p-value sum and shows (batch number, p-value sum).
lowest_psum_batch = min(batch_psum, key=batch_psum.get)
(lowest_psum_batch, batch_psum[lowest_psum_batch])

In [None]:
# Genome batch 2996 does; show its chromosome positions, p-values and SNP ids
strongest_batch_id = 2996
batches[strongest_batch_id]

In [None]:
# Build a training set for the batch with strong associations:
# keep that batch's SNP columns plus the "id" column from the genotype table.

grp1_columns = [*batches[2996]["snps"], "id"]
geno_grp1 = geno.filter(grp1_columns)

# Cast every float64 column to int64 in a single astype call
floats = geno_grp1.select_dtypes(include=['float64'])
geno_grp1 = geno_grp1.astype({name: 'int64' for name in floats.columns})

geno_grp1.head()

In [None]:
# Preview the first rows of the phenome table again
pheno.head()

In [None]:
# List the columns kept in the batch genotype frame
geno_grp1.columns

In [None]:
# Attach the phenome information to the genome training set.
# The inner join on "id" keeps only ids present in both tables.

pheno_by_id = pheno.set_index('id')
genome_phenome = geno_grp1.join(pheno_by_id, on="id", how="inner")

# Training columns: the batch's SNPs followed by abBMD and SW16
columns_use = [*batches[2996]["snps"], "abBMD", "SW16"]

genome_train = genome_phenome.filter(columns_use)

genome_train

In [None]:
# Write the training set to CSV with a header row and without the index column

genofile = data_path.joinpath("geno_abBMD_train.csv")
genome_train.to_csv(genofile, header=True, index=False)

In [None]:
# Now create a version of map.txt with just the SNPs in the training set of this first batch

mapfile = data_path.joinpath("map.txt")
mapdata = pd.read_csv(mapfile, sep=' ')

# Keep only the map rows whose "id" is one of the batch's SNPs
in_batch = mapdata["id"].isin(batches[2996]["snps"])
mapdata_use = mapdata[in_batch]

mapfile_new = data_path.joinpath("map_abBMD.txt")
mapdata_use.to_csv(mapfile_new, sep=' ', index=False, header=True)