# Build the mouse genome training dataset
This notebook creates a mouse genome training dataset by batching the genome data by position on the chromosome. We then create a training set for the phenome abBMD using an example batch of genome data (SNPs).

In [9]:
import os
import pathlib
import pandas as pd

base_path = pathlib.Path(os.getcwd().replace("/synthetics", ""))
data_path = base_path / 'data'
experiment_path = base_path / 'out_orig' 

In [5]:
# Read in the geno data and remove the discards

import pandas as pd
genofile = data_path / "geno.txt"
geno = pd.read_csv(genofile, sep=' ')
geno = geno[geno["discard"] == "no"]
geno.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,discard,cfw-1-3082859,cfw-1-3207478,cfw-1-3284999,cfw-1-4056451,rs241840178,cfw-1-4592184,rs214108183,rs31954814,...,rs239202862,cfw-19-60773695,rs212272420,rs51223003,rs30736750,rs30654044,rs30990073,rs50978457,rs51755773,cfw-19-61107432
0,26305,no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,0.0,0.0
1,26306,no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.929,1.98,1.999,0.0,0.0,2.0,1.964,1.689,1.781,0.254
2,26307,no,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,1.933,1.603,1.993,0.0,0.0,2.0,1.492,1.793,1.538,0.526
3,26308,no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.776,...,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,1.0,0.0
4,26309,no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,0.0,0.0,2.0,2.0,2.0,2.0,0.361


In [18]:
# Read in the muscle and bone phenome data, which contains the abBMD phenome and its covariate

phenofile = data_path/ "pheno_batch0_withID.csv"
pheno = pd.read_csv(phenofile)
pheno.head()

Unnamed: 0,TA,SW16,tibia,EDL,plantaris,gastroc,SW6,sacweight,BMD,abBMD,id
0,74.6,0.0,19.05,16.9,23.2,194.8,0.0,46.6,1.92,0.0,26305
1,62.3,0.0,18.06,13.2,18.8,154.6,0.0,35.7,1.88,0.0,26306
2,54.1,0.0,18.14,11.2,17.6,143.6,0.0,34.1,1.89,0.0,26307
3,56.5,0.0,18.18,12.9,17.7,148.6,0.0,41.8,1.95,0.0,26308
4,64.0,0.0,18.35,14.1,20.6,157.4,0.0,39.5,1.92,0.0,26309


In [10]:
# Grab the original GWAS linear model results for abBMD and the chromosome position information

import pandas as pd
gwasfile = experiment_path / "lm_abBMD_1_79646.csv"
gwas_scores = pd.read_csv(gwasfile)
gwas_scores.head()

Unnamed: 0.1,Unnamed: 0,snp,chr,pos,p
0,1,rs29477109,11,95292217,5.052317e-14
1,2,rs27071351,11,96114911,7.074181e-14
2,3,rs27024162,11,96918116,7.170582e-14
3,4,rs49423067,11,96918212,7.198661e-14
4,5,rs29470802,11,95263588,8.049849e-14


In [11]:
# Sort by position on the chromosome

gwas_scores_sorted = gwas_scores.sort_values(by=['chr','pos']).reset_index()
gwas_scores_sorted.head()

Unnamed: 0.1,index,Unnamed: 0,snp,chr,pos,p
0,73583,73584,cfw-1-3207478,1,3207478,0.921913
1,40919,40920,cfw-1-4592184,1,4592184,0.496735
2,29303,29304,rs31954814,1,5151352,0.353184
3,44285,44286,rs31947195,1,5240999,0.540256
4,40335,40336,rs30660852,1,5241015,0.489395


In [12]:
# Group the SNPs into batches of 17 (saving room to use abBMD and SW16 as seeds)

batches = {}
batch_num = -1
max_snps_per_batch = 17
snp_cnt = 0
last_chromo = -1
batch_psum = {}

for i in range(len(gwas_scores_sorted)):
    pscore = gwas_scores_sorted.loc[i]['p']
    chromo = gwas_scores_sorted.loc[i]['chr']
    pos = gwas_scores_sorted.loc[i]['pos'] 
    snp = gwas_scores_sorted.loc[i]['snp'] 
    if ((snp_cnt == max_snps_per_batch) or (chromo != last_chromo)):
        batch_num += 1
        batches[batch_num] = {}
        batches[batch_num]['chr_pos'] = []
        batches[batch_num]['pvalues'] = []
        batches[batch_num]['snps'] = []
        batch_psum[batch_num] = 0
        snp_cnt = 0
    batches[batch_num]['chr_pos'].append(str(chromo) + "_" + str(pos))
    batches[batch_num]['pvalues'].append(pscore)
    batches[batch_num]['snps'].append(snp)
    batch_psum[batch_num] += pscore
    last_chromo = chromo
    snp_cnt += 1
     

In [13]:
#Which batch as the strongest associations with abBMD?
min(batch_psum.items(), key=lambda k: k[1])

(2996, 0.0004329560548238054)

In [14]:
# Genome batch 2996 does
batches[2996]

{'chr_pos': ['11_96682007',
  '11_96682022',
  '11_96682466',
  '11_96691808',
  '11_96717503',
  '11_96737037',
  '11_96737262',
  '11_96740154',
  '11_96740180',
  '11_96772332',
  '11_96772746',
  '11_96776217',
  '11_96816950',
  '11_96816976',
  '11_96816977',
  '11_96817194',
  '11_96820433'],
 'pvalues': [7.79099743016085e-10,
  5.59364680710276e-12,
  2.60293101335005e-10,
  2.51531187395961e-13,
  2.08605025783302e-12,
  3.01468028552801e-11,
  2.59793199141993e-13,
  1.65307194168037e-06,
  1.0197029621742702e-06,
  1.7836455671521602e-11,
  9.627958903863468e-10,
  8.719371386295509e-11,
  3.15362730226529e-13,
  3.12170156699098e-13,
  3.1804086347162795e-13,
  3.26757383049382e-13,
  0.00043028113309089103],
 'snps': ['rs27052855',
  'rs257710525',
  'rs6258876',
  'rs49153109',
  'rs29395706',
  'rs49725879',
  'rs6284806',
  'rs27052698',
  'rs255791755',
  'rs49072129',
  'rs29467625',
  'rs27037903',
  'rs50536616',
  'rs240744127',
  'rs27037855',
  'rs27037853',
  'r

In [15]:
# Build a training set for this batch with strong associations

grp1_columns = list(batches[2996]["snps"])
grp1_columns.append("id")
geno_grp1 = geno.filter(grp1_columns)

# Cast float values to integers
floats = geno_grp1.select_dtypes(include=['float64']) 
for col in floats.columns.values:
    geno_grp1[col] = geno_grp1[col].astype('int64')

geno_grp1.head()

Unnamed: 0,rs27052855,rs257710525,rs6258876,rs49153109,rs29395706,rs49725879,rs6284806,rs27052698,rs255791755,rs49072129,rs29467625,rs27037903,rs50536616,rs240744127,rs27037855,rs27037853,rs29464487,id
0,0,0,0,1,0,1,1,0,0,0,0,0,1,1,1,1,1,26305
1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,26306
2,2,1,1,2,2,2,2,1,0,1,1,1,2,2,2,2,1,26307
3,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,26308
4,2,2,2,2,2,2,2,0,1,2,2,2,2,2,2,2,2,26309


In [44]:
pheno.head()

Unnamed: 0,TA,SW16,tibia,EDL,plantaris,gastroc,SW6,sacweight,BMD,abBMD,id
0,74.6,0.0,19.05,16.9,23.2,194.8,0.0,46.6,1.92,0.0,26305
1,62.3,0.0,18.06,13.2,18.8,154.6,0.0,35.7,1.88,0.0,26306
2,54.1,0.0,18.14,11.2,17.6,143.6,0.0,34.1,1.89,0.0,26307
3,56.5,0.0,18.18,12.9,17.7,148.6,0.0,41.8,1.95,0.0,26308
4,64.0,0.0,18.35,14.1,20.6,157.4,0.0,39.5,1.92,0.0,26309


In [50]:
geno_grp1.columns

Index(['rs27052855', 'rs257710525', 'rs6258876', 'rs49153109', 'rs29395706',
       'rs49725879', 'rs6284806', 'rs27052698', 'rs255791755', 'rs49072129',
       'rs29467625', 'rs27037903', 'rs50536616', 'rs240744127', 'rs27037855',
       'rs27037853', 'rs29464487', 'id'],
      dtype='object')

In [19]:
# Add in the phenome information to the genome training set

#genome_phenome = geno_grp1.join(pheno.set_index('id'), on='id', how='inner', lsuffix='_genome', rsuffix='_phenome')
genome_phenome = geno_grp1.join(pheno.set_index('id'), on = "id", how = "inner")

columns_use = list(batches[2996]["snps"])
columns_use.append("abBMD")
columns_use.append("SW16")

genome_train = genome_phenome.filter(columns_use)

genome_train

Unnamed: 0,rs27052855,rs257710525,rs6258876,rs49153109,rs29395706,rs49725879,rs6284806,rs27052698,rs255791755,rs49072129,rs29467625,rs27037903,rs50536616,rs240744127,rs27037855,rs27037853,rs29464487,abBMD,SW16
0,0,0,0,1,0,1,1,0,0,0,0,0,1,1,1,1,1,0.0,0.0
1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,0.0,0.0
2,2,1,1,2,2,2,2,1,0,1,1,1,2,2,2,2,1,0.0,0.0
3,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0.0,0.0
4,2,2,2,2,2,2,2,0,1,2,2,2,2,2,2,2,2,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1155,1,1,0,1,1,1,1,0,0,0,0,1,1,1,1,1,2,0.0,0.0
1156,0,0,0,1,1,1,1,0,0,0,1,0,1,1,1,1,1,0.0,0.0
1158,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0
1159,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0


In [20]:
# Save the training file

genofile = data_path / "geno_abBMD_train.csv"
genome_train.to_csv(genofile, index=False, header=True)