In [1]:
import pandas as pd
import os
import shutil
import numpy as np
import glob

from QC.utils import shell_do
from QC.imputation import impute_data_prep
import QC.config as config

In [2]:
# ---- genotype data locations -------------------------------------------------
# raw IDATs, per-sample PLINK filesets, and the QC output root
idat_path = '/data/CARD/PD/GP2/raw_genotypes/GP2_idats'
raw_plink_path = '/data/CARD/PD/GP2/raw_genotypes/GP2_plink'
gp2_qc_path = '/data/CARD/PD/GP2/raw_genotypes/GP2_QC'

# fileset prefix for the round1 QC input
geno_path = f'{gp2_qc_path}/round1/GP2_QC_round1'

# ---- Illumina resources: manifest (.bpm), cluster file (.egt), iaap-cli binary
ilmn_files_path = '/data/CARD/PD/GP2/ilmn_files'
bpm = f'{ilmn_files_path}/NeuroBooster_20042459_A1.bpm'
egt = f'{ilmn_files_path}/NBSCluster_file_n1393_011921.egt'
iaap = f'{ilmn_files_path}/iaap-cli/iaap-cli'

# ---- sample key and clinical tables ------------------------------------------
key_dir = '/data/CARD/PD/GP2/key_files'
clin_dir = '/data/CARD/PD/GP2/clinical'
key_file = f'{key_dir}/master_key.csv'
clin_file = f'{clin_dir}/GP2_clinical.csv'

# ---- where the generated .swarm command files are written --------------------
swarm_scripts_dir = '/data/CARD/PD/GP2/swarm_scripts'
# swarm_dir = f'/data/CARD/PD/GP2/swarm_scripts/swarm'

# Create PLINK-style phenotype, sex, and ID-update files

In [9]:
# Build the PLINK update files (IDs, phenotype, sex) by joining the sample key
# to the clinical table.
pheno = pd.read_csv(clin_file)
key = pd.read_csv(key_file)

# Cast both join keys to str so the merge compares like with like.
pheno['Original_clinicalID'] = pheno['Original_clinicalID'].astype(str)
key['Sample_ID'] = key['Sample_ID'].astype(str)

# '<SentrixBarcode_A>_<SentrixPosition_A>' is the per-sample file stem used by the
# later cells; both parts are cast to str, matching how IID is built below.
key['filename'] = key['SentrixBarcode_A'].astype(str) + '_' + key['SentrixPosition_A'].astype(str)

pheno_out = key.merge(pheno, how='inner', left_on='Sample_ID', right_on='Original_clinicalID')
pheno_out['IID'] = pheno_out['SentrixBarcode_A'].astype(str) + '_' + pheno_out['SentrixPosition_A'].astype(str)
pheno_out['FID'] = 0
pheno_out['FID_new'] = 0

# update codes to plink format
pheno_codes = {'PD': 2, 'Control': 1, 'Not Reported': 0}
sex_codes = {'Male': 1, 'Female': 2, 'Not Reported': 0}

for out_col, src_col, codes in (('pheno', 'Phenotype', pheno_codes), ('sex', 'Sex', sex_codes)):
    coded = pheno_out[src_col].map(codes)
    # A label outside the mapping (or a missing value) maps to NaN. Fail with the
    # offending labels instead of an opaque NaN-to-int cast error further down.
    unmapped = pheno_out.loc[coded.isna(), src_col]
    if not unmapped.empty:
        raise ValueError(
            f'Unexpected {src_col} values, cannot convert to PLINK codes: '
            f'{sorted(unmapped.astype(str).unique())}'
        )
    # Plain column assignment replaces the column outright, so the change from
    # numeric to string dtype does not depend on in-place upcasting via .loc[:, col].
    pheno_out[out_col] = coded.astype(int).astype(str)

# Tab-separated, headerless files:
#   update_ids.txt   -> old FID, old IID, new FID, new IID (GP2SampleID)
#   update_pheno.txt -> FID, IID, phenotype code
#   update_sex.txt   -> FID, IID, sex code
pheno_out[['FID', 'IID', 'FID_new', 'GP2SampleID']].to_csv(f'{clin_dir}/update_ids.txt', sep='\t', header=False, index=False)
pheno_out[['FID_new', 'GP2SampleID', 'pheno']].to_csv(f'{clin_dir}/update_pheno.txt', sep='\t', header=False, index=False)
pheno_out[['FID_new', 'GP2SampleID', 'sex']].to_csv(f'{clin_dir}/update_sex.txt', sep='\t', header=False, index=False)


In [17]:
# sanity check: how many samples carry each PLINK sex code
pheno_out['sex'].value_counts()

1    3054
2    2259
0       3
Name: sex, dtype: int64

# Convert IDATs to PED

In [27]:
# Write one `iaap-cli gencall` command per unique Sentrix barcode to the swarm file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f'{swarm_scripts_dir}/idat_to_ped.swarm', 'w') as f:
    for code in key.SentrixBarcode_A.unique():
        # Arguments: manifest, cluster file, output directory, then the barcode's
        # IDAT folder (-f), -p, and -t 8.
        idat_to_ped_cmd = ' '.join([
            f'{iaap} gencall',
            f'{bpm}',
            f'{egt}',
            f'{raw_plink_path}/',
            f'-f {idat_path}/{code}',
            '-p',
            '-t 8',
        ])
        f.write(f'{idat_to_ped_cmd}\n')

In [29]:
# Submit the idat_to_ped command file to swarm; logs go to {swarm_scripts_dir}/logs.
# NOTE(review): swarm is given -t 16 while each gencall command is written with -t 8 — confirm this is intended.
!swarm -f {swarm_scripts_dir}/idat_to_ped.swarm -g 32 -t 16 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

20940619


In [32]:
# Give every existing .ped a same-named .map by copying the shared NeuroBooster map file.
map_file = f'{raw_plink_path}/NeuroBooster_20042459_A1.map'
for filename in key.filename:
    ped = f'{raw_plink_path}/{filename}.ped'
    out_map = f'{raw_plink_path}/{filename}.map'
    if not os.path.isfile(ped):
        # no .ped for this sample, so report it and do not create a map
        print(f'{ped} does not exist!')
        print(f'{out_map} creation cancelled')
        continue
    shutil.copyfile(map_file, out_map)

/data/CARD/PD/GP2/raw_genotypes/GP2_plink/205275450156_R02C01.ped does not exist!
/data/CARD/PD/GP2/raw_genotypes/GP2_plink/205275450156_R02C01.map creation cancelled


In [33]:
# Write one `plink --make-bed` command per sample to the swarm file.
# Samples without both a .ped and a .map are skipped and reported, so no job is
# queued that can only fail. The `with` block closes the file on exit.
with open(f'{swarm_scripts_dir}/make_bed.swarm', 'w') as f:
    for filename in key.filename:
        ped = f'{raw_plink_path}/{filename}'
        # plink --file expects <prefix>.ped and <prefix>.map
        if not (os.path.isfile(f'{ped}.ped') and os.path.isfile(f'{ped}.map')):
            print(f'{ped}.ped and/or {ped}.map does not exist; make-bed command skipped')
            continue
        make_bed_cmd = ' '.join([
            'plink',
            f'--file {ped}',
            '--make-bed',
            f'--out {raw_plink_path}/{filename}',
        ])
        f.write(f'{make_bed_cmd}\n')

In [34]:
# Submit the make_bed command file to swarm.
# NOTE(review): --logdir is the relative path `swarm` here, while the idat_to_ped submission uses {swarm_scripts_dir}/logs — confirm which location is intended.
!swarm -f {swarm_scripts_dir}/make_bed.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

20951177


In [10]:
# write plink merge command
# Step 1: list the fileset prefix of every sample whose .bed exists; report the rest.
with open(f"{raw_plink_path}/merge_bed.list", 'w') as f:
    for filename in key.filename:
        bed = f'{raw_plink_path}/{filename}'
        if os.path.isfile(f'{bed}.bed'):
            f.write(f'{bed}\n')
        else:
            print(f'{bed} does not exist!!!')

# Step 2: a single plink command that merges the listed filesets, applies the ID
# update file, and writes the merged fileset as GP2_merge.
plink_merge_cmd = ' '.join([
    'plink',
    f'--merge-list {raw_plink_path}/merge_bed.list',
    f'--update-ids {clin_dir}/update_ids.txt',
    '--make-bed',
    f'--out {raw_plink_path}/GP2_merge',
])

with open(f"{swarm_scripts_dir}/merge.swarm", 'w') as f:
    # newline-terminated, like the lines of the other swarm files
    f.write(f"{plink_merge_cmd}\n")

/data/CARD/PD/GP2/raw_genotypes/GP2_plink/205275450156_R02C01 does not exist!!!


In [53]:
# Submit the single-command merge file to swarm.
# NOTE(review): --logdir is the relative path `swarm`, resolved against the notebook's working directory — confirm the intended log location.
!swarm -f {swarm_scripts_dir}/merge.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

20969678


In [18]:
# Apply update_pheno.txt (--pheno) and update_sex.txt (--update-sex) to the merged fileset
# and write the result to the round1 QC prefix ({geno_path}).
!plink --bfile {raw_plink_path}/GP2_merge --pheno {clin_dir}/update_pheno.txt --update-sex {clin_dir}/update_sex.txt --make-bed --out {geno_path}

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /data/CARD/PD/GP2/raw_genotypes/GP2_QC/round1/GP2_QC_round1.log.
Options in effect:
  --bfile /data/CARD/PD/GP2/raw_genotypes/GP2_plink/GP2_merge
  --make-bed
  --out /data/CARD/PD/GP2/raw_genotypes/GP2_QC/round1/GP2_QC_round1
  --pheno /data/CARD/PD/GP2/clinical/update_pheno.txt
  --update-sex /data/CARD/PD/GP2/clinical/update_sex.txt

1547809 MB RAM detected; reserving 773904 MB for main workspace.
2004347 variants loaded from .bim file.
5320 people (2988 males, 2180 females, 152 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
/data/CARD/PD/GP2/raw_genotypes/GP2_QC/round1/GP2_QC_round1.nosex .
5312 phenotype values present after --pheno.
--update-sex: 5315 people updated, 1 ID not present.
phenotypes to be ignored, use the --allow-no-sex flag.
Using 1 thread (no multithreaded calculations invoked).
Before ma

In [1]:
# missing clinical information
# key[~key.filename.isin(pheno_out.filename)]

In [50]:
!cat {raw_plink_path}/merge_bed.list | wc -l

5320
