In [1]:
import pandas as pd
import os
import shutil
import numpy as np
import glob

from QC.utils import shell_do
from QC.imputation import impute_data_prep
import QC.config as config

In [2]:
# function definitions

def make_gencall_cmds(idat_path, bpm_file, cluster_file, out_path, iaap=None, code=None):
    """Build the `gencall` command line that writes PED output for a folder of IDATs.

    Args:
        idat_path: directory holding the IDAT files (or the per-plate folders).
        bpm_file: path to the bead pool manifest (.bpm) passed to gencall.
        cluster_file: path to the cluster file (.egt) passed to gencall.
        out_path: base output directory; the command writes to `{out_path}/ped/`.
        iaap: path to the iaap-cli executable. Required.
        code: optional plate sub-folder of `idat_path`; when given, the command
            points `-f` at `{idat_path}/{code}`, otherwise at `idat_path` itself.

    Returns:
        The command as a single space-separated string.

    Raises:
        ValueError: if `iaap` is not supplied (the command would start with "None").
    """
    if iaap is None:
        raise ValueError('iaap must be the path to the iaap-cli executable')

    # Previously the plate code was read from a global loop variable; it is now explicit.
    idat_folder = idat_path if code is None else f'{idat_path}/{code}'

    cmd = ' '.join([
        f'{iaap} gencall',
        f'{bpm_file}',
        f'{cluster_file}',
        f'{out_path}/ped/',
        f'-f {idat_folder}',
        '-p',
        '-t 8',
    ])
    return cmd

In [3]:
# Root locations: raw genotype deliveries and the destination for processed genotypes.
basedir = '/data/CARD/PD/GP2/raw_genotypes'
out_genotypes = '/data/CARD/PD/GP2/genotypes'

# Everything for this delivery lives under one cohort folder.
cohort_path = basedir + '/shulman_ny'
sample_info_path = cohort_path + '/sample_info'
gtc_file_path = cohort_path + '/GP2_GCT_files'
raw_idat_file_path = cohort_path + '/GP2_Shulman'
ped_dir = cohort_path + '/ped'
plink_dir = cohort_path + '/plink'
swarm_scripts_dir = cohort_path + '/swarm_scripts'
idat_dir = cohort_path + '/idats'

# Working directories written to by later cells; created up front if absent.
for working_dir in (
    ped_dir,
    plink_dir,
    plink_dir + '/indiv_samples',
    swarm_scripts_dir,
    idat_dir,
):
    os.makedirs(working_dir, exist_ok=True)

# Sample sheet, key file, and the array manifest / cluster file used for genotype calling.
key_file = sample_info_path + '/sample_key.txt'
manifest_txt_path = sample_info_path + '/sample_manifest.txt'
bpm = sample_info_path + '/NeuroBooster_20042459_A1.bpm'
cluster_file = sample_info_path + '/NBSCluster_file_n1393_011921.egt'

# Per-site phenotype tables.
bcm_pheno_file = sample_info_path + '/BCM_20210201_samples.csv'
umd_pheno_file = sample_info_path + '/UMD_20210201_samples.csv'

# Genotype-calling executable (path is relative to the notebook's working directory).
iaap = 'executables/iaap-cli-linux-x64-1.1.0-sha.80d7e5b3d9c1fdfc2e99b472a90652fd3848bbc7/iaap-cli/iaap-cli'

In [4]:
# some rearranging
# !mkdir {shulman_ny_path}/sample_info
# !cp {gtc_file_path}/Key\ File_FINAL_Shulman_and_NY_011421.txt {sample_info_path}/sample_key.txt
# !cp {gtc_file_path}/FINALSS_after_rerun__Shulman_and_NY_011421.csv {sample_info_path}/sample_manifest.txt
# !cp {gtc_file_path}/NeuroBooster_20042459_A1.bpm {sample_info_path}/
# !cp {gtc_file_path}/NBSCluster_file_n1393_011921.egt {sample_info_path}/



In [7]:
# Build the two tab-separated files used later by PLINK:
#   update_ids.txt   -> FID, IID (barcode_position), new FID, new IID (Sample_ID); passed to --update-ids
#   update_pheno.txt -> new FID, Sample_ID, phenotype code; passed to --pheno
# header=10 tells pandas that the column names sit on the 11th line of the sample sheet.
manifest = pd.read_csv(manifest_txt_path, header=10)
bcm_pheno = pd.read_csv(bcm_pheno_file)
umd_pheno = pd.read_csv(umd_pheno_file)
# DataFrame.append was removed in pandas 2.0; concat stacks the rows the same way.
pheno = pd.concat([bcm_pheno, umd_pheno])

# Cast both join keys to str so that numeric-looking IDs match across tables.
pheno['Original_clinicalID'] = pheno['Original_clinicalID'].astype(str)
manifest['Sample_ID'] = manifest['Sample_ID'].astype(str)
manifest['filename'] = manifest['SentrixBarcode_A'].astype(str) + '_' + manifest['SentrixPosition_A']
# Left join: every manifest row is kept, with or without a phenotype record.
pheno_out = manifest.merge(pheno, how='left', left_on='Sample_ID', right_on='Original_clinicalID')
pheno_out['IID'] = pheno_out.SentrixBarcode_A.astype(str) + '_' + pheno_out.SentrixPosition_A.astype(str)
pheno_out['FID'] = 0
pheno_out['FID_new'] = 0

# Phenotype coding: 2 = PD, 1 = Control, 0 = anything else (including missing).
pheno_out['pheno'] = 0
pheno_out.loc[pheno_out.Phenotype == 'PD', 'pheno'] = 2
pheno_out.loc[pheno_out.Phenotype == 'Control', 'pheno'] = 1
# `== np.nan` never matches (NaN != NaN); isna() is the test that actually finds missing values.
pheno_out.loc[pheno_out.Phenotype.isna(), 'pheno'] = 0

pheno_out[['FID','IID', 'FID_new', 'Sample_ID']].to_csv(f'{sample_info_path}/update_ids.txt', sep='\t', header=False, index=False)
pheno_out[['FID_new', 'Sample_ID', 'pheno']].to_csv(f'{sample_info_path}/update_pheno.txt', sep='\t', header=False, index=False)

In [11]:
# Make one sub-directory of idat_dir per Sentrix barcode listed in the manifest.
for code in manifest.SentrixBarcode_A.unique():
    plate_dir = f'{idat_dir}/{code}'
    if not os.path.exists(plate_dir):
        os.mkdir(plate_dir)
    else:
        print(f'{plate_dir} already exists')

# Copy the green and red intensity files of every sample into its barcode folder.
# Source files that cannot be found are collected instead of raising.
missing_idats = []

for sentrix_code, filename in zip(manifest.SentrixBarcode_A, manifest.filename):
    for channel in ('Grn', 'Red'):
        idat_name = f'{filename}_{channel}.idat'
        source_idat = f'{raw_idat_file_path}/{sentrix_code}/{idat_name}'

        if not os.path.isfile(source_idat):
            missing_idats.append(source_idat)
            continue

        shutil.copyfile(src=source_idat, dst=f'{idat_dir}/{sentrix_code}/{idat_name}')

# Displayed as the cell result: number of IDAT files that were not found.
len(missing_idats)

0

In [22]:
# Write one gencall command per Sentrix barcode folder to the idat_to_ped swarm file.
# The `with` block closes the file on exit.
with open(f'{swarm_scripts_dir}/idat_to_ped.swarm', 'w') as f:

    for code in manifest.SentrixBarcode_A.unique():

        # Argument order: manifest, cluster file, output folder, then the IDAT folder via -f.
        idat_to_ped_cmd = ' '.join([
            f'{iaap} gencall',
            f'{bpm}',
            f'{cluster_file}',
            f'{ped_dir}/',
            f'-f {idat_dir}/{code}',
            '-p',
            '-t 8',
        ])

        f.write(idat_to_ped_cmd + '\n')

In [45]:
# !swarm -f {swarm_scripts_dir}/idat_to_ped.swarm -g 32 -t 16 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

12454654


In [46]:
# Give every existing PED file a same-named copy of the shared map file.
map_file = f'{ped_dir}/NeuroBooster_20042459_A1.map'
for filename in manifest.filename:
    sample_prefix = f'{ped_dir}/{filename}'
    ped = sample_prefix + '.ped'
    out_map = sample_prefix + '.map'

    # No PED for this sample: report it and leave the map uncreated.
    if not os.path.isfile(ped):
        print(f'{ped} does not exist!')
        print(f'{out_map} creation cancelled')
        continue

    shutil.copyfile(src=map_file, dst=out_map)


In [23]:

# Write one `plink --file ... --make-bed` command per manifest sample; outputs go to indiv_samples/.
with open(f'{swarm_scripts_dir}/make_bed.swarm', 'w') as f:
    for filename in manifest.filename:
        ped = f'{ped_dir}/{filename}'
        bed_prefix = f'{plink_dir}/indiv_samples/{filename}'
        make_bed_cmd = f'plink --file {ped} --make-bed --out {bed_prefix}'

        f.write(make_bed_cmd + '\n')


In [54]:
# !swarm -f {swarm_scripts_dir}/make_bed.swarm -g 8 -t 8 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

12455471


In [24]:
# List the per-sample fileset prefixes (one per line) for plink --merge-list.
with open(f"{plink_dir}/merge_bed.list", 'w') as f:
    f.writelines(
        f'{plink_dir}/indiv_samples/{filename}\n' for filename in manifest.filename
    )

# Single merge command: combine all listed filesets and apply the ID updates.
plink_merge_cmd = ' '.join([
    'plink',
    f'--merge-list {plink_dir}/merge_bed.list',
    f'--update-ids {sample_info_path}/update_ids.txt',
    '--make-bed',
    f'--out {plink_dir}/indiv_samples/shulman_merge',
])

with open(f"{swarm_scripts_dir}/merge.swarm", 'w') as f:
    # Written without a trailing newline, as before.
    f.write(plink_merge_cmd)

In [58]:
# !swarm -f {swarm_scripts_dir}/merge.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

12456657


In [59]:
# Read the merged fileset, apply the phenotype file via --pheno, and write the result to {plink_dir}/shulman.
!plink --bfile {plink_dir}/indiv_samples/shulman_merge --pheno {sample_info_path}/update_pheno.txt --make-bed --out {plink_dir}/shulman

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /data/CARD/PD/GP2/raw_genotypes/shulman_ny/plink/shulman.log.
Options in effect:
  --bfile /data/CARD/PD/GP2/raw_genotypes/shulman_ny/plink/indiv_samples/shulman_merge
  --make-bed
  --out /data/CARD/PD/GP2/raw_genotypes/shulman_ny/plink/shulman
  --pheno /data/CARD/PD/GP2/raw_genotypes/shulman_ny/sample_info/update_pheno.txt

1547809 MB RAM detected; reserving 773904 MB for main workspace.
2004347 variants loaded from .bim file.
1393 people (765 males, 601 females, 27 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
/data/CARD/PD/GP2/raw_genotypes/shulman_ny/plink/shulman.nosex .
1285 phenotype values present after --pheno.
phenotypes to be ignored, use the --allow-no-sex flag.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 1393 founders and 0 nonfounders present.
Calculat

In [5]:
# run QC pipeline
geno_path = f'{basedir}/shulman_ny/plink/shulman'
out_dir = f'{out_genotypes}/shulman_ny/clean'
out_path = f'{out_dir}/shulman_ny'
os.makedirs(f'{out_dir}', exist_ok=True)
ref_dir_path = '/data/LNG/vitaled2/1kgenomes'
ref_panel = f'{ref_dir_path}/1kg_ashkj_ref_panel_gp2_pruned'
ref_labels = f'{ref_dir_path}/ref_panel_ancestry.txt'


with open(f'{swarm_scripts_dir}/run_qc_pipeline.swarm','w') as f:
    run_pipeline = f'python3 ../run_qc_pipeline.py --geno {geno_path} --ref {ref_panel} --ref_labels {ref_labels} --out {out_path}'
    f.write(f'{run_pipeline}\n')
f.close()
!cat {swarm_scripts_dir}/run_qc_pipeline.swarm

python3 ../run_qc_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/shulman_ny/plink/shulman --ref /data/LNG/vitaled2/1kgenomes/1kg_ashkj_ref_panel_gp2_pruned --ref_labels /data/LNG/vitaled2/1kgenomes/ref_panel_ancestry.txt --out /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny


In [21]:
# Submit the QC pipeline swarm file; the number printed under the cell is the command's output (presumably the job id).
!swarm -f {swarm_scripts_dir}/run_qc_pipeline.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

18227274


In [13]:
# Create <out_genotypes>/<cohort>/{clean,imputed} for each site-level cohort.
cohorts = ['bcm','umd']

for cohort in cohorts:
    cohort_dir = f'{out_genotypes}/{cohort}'
    clean_dir, impute_dir = (f'{cohort_dir}/{subdir}' for subdir in ('clean', 'imputed'))
    for target_dir in (cohort_dir, clean_dir, impute_dir):
        os.makedirs(target_dir, exist_ok=True)



In [8]:
# Ancestry labels; the cells below look for one fileset per label.
labels = ['EUR', 'AMR', 'SAS', 'AJ', 'AAC', 'EAS']

# Clinical IDs are join keys in later merges, so hold them as strings in both tables.
for cohort_pheno in (bcm_pheno, umd_pheno):
    cohort_pheno['Original_clinicalID'] = cohort_pheno['Original_clinicalID'].astype(str)

In [11]:
# now, split up by cohort and ancestry
# For each ancestry label: read the cleaned .fam, find the samples belonging to each
# site (BCM / UMD) by clinical ID, write a keep-list, and subset the fileset with plink.

for label in labels:
    geno_path = f'{out_genotypes}/shulman_ny/clean/shulman_ny_{label}'
    fam_path = f'{geno_path}.fam'
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    fam = pd.read_csv(fam_path, sep=r'\s+', header=None, names=['FID','IID','PAT','MAT','sex','pheno'])
    fam.IID = fam.IID.astype(str)

    print(label)
    cmds = []
    for cohort, cohort_pheno in (('bcm', bcm_pheno), ('umd', umd_pheno)):
        label_out = f'{out_genotypes}/{cohort}/clean/{cohort}_{label}'
        cohort_label = fam.merge(cohort_pheno, how='inner', left_on='IID', right_on='Original_clinicalID')

        cohort_label[['FID','IID']].to_csv(f'{label_out}.samples', header=False, index=False, sep='\t')
        print(f'{cohort.upper()}: {cohort_label.shape}')

        # plink stops with an error when --keep leaves no samples, so skip empty subsets.
        if cohort_label.empty:
            print(f'no {cohort} samples with label {label}; skipping plink')
            continue

        cmds.append(f'plink --bfile {geno_path} --keep {label_out}.samples --make-bed --out {label_out}')
    print()

    for cmd in cmds:
        shell_do(cmd)




EUR
BCM: (638, 16)
UMD: (368, 16)



Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_EUR --keep /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_EUR.samples --make-bed --out /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_EUR
Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_EUR --keep /data/CARD/PD/GP2/genotypes/umd/clean/umd_EUR.samples --make-bed --out /data/CARD/PD/GP2/genotypes/umd/clean/umd_EUR


AMR
BCM: (77, 16)
UMD: (7, 16)



Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_AMR --keep /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_AMR.samples --make-bed --out /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_AMR
Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_AMR --keep /data/CARD/PD/GP2/genotypes/umd/clean/umd_AMR.samples --make-bed --out /data/CARD/PD/GP2/genotypes/umd/clean/umd_AMR


SAS
BCM: (17, 16)
UMD: (6, 16)



Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_SAS --keep /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_SAS.samples --make-bed --out /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_SAS
Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_SAS --keep /data/CARD/PD/GP2/genotypes/umd/clean/umd_SAS.samples --make-bed --out /data/CARD/PD/GP2/genotypes/umd/clean/umd_SAS


AJ
BCM: (27, 16)
UMD: (57, 16)



Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_AJ --keep /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_AJ.samples --make-bed --out /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_AJ
Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_AJ --keep /data/CARD/PD/GP2/genotypes/umd/clean/umd_AJ.samples --make-bed --out /data/CARD/PD/GP2/genotypes/umd/clean/umd_AJ


AAC
BCM: (24, 16)
UMD: (14, 16)



Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_AAC --keep /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_AAC.samples --make-bed --out /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_AAC
Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_AAC --keep /data/CARD/PD/GP2/genotypes/umd/clean/umd_AAC.samples --make-bed --out /data/CARD/PD/GP2/genotypes/umd/clean/umd_AAC


EAS
BCM: (7, 16)
UMD: (4, 16)



Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_EAS --keep /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_EAS.samples --make-bed --out /data/CARD/PD/GP2/genotypes/bcm/clean/bcm_EAS
Executing: plink --bfile /data/CARD/PD/GP2/genotypes/shulman_ny/clean/shulman_ny_EAS --keep /data/CARD/PD/GP2/genotypes/umd/clean/umd_EAS.samples --make-bed --out /data/CARD/PD/GP2/genotypes/umd/clean/umd_EAS


In [10]:
# run imputation
# Prepares one imputation input per cleaned ancestry fileset and writes one
# run_imputation_pipeline.py command per fileset to a swarm file.

# these files need to be in a different place eventually
ref_panel='/data/vitaled2/GenoTools/ref/PASS.Variantsbravo-dbsnp-all.tab'
check_bim_pl = '/data/vitaled2/GenoTools/ref/HRC-1000G-check-bim.pl'

imputed_out_dir = f'{out_genotypes}/shulman_ny/imputed' 
os.makedirs(imputed_out_dir, exist_ok=True)

# janky method for pulling cleaned genos with ancestry labels for imputation... should figure out how to do this better later
# splitext removes only the '.bed' suffix (split('.') would cut at the first dot anywhere
# in the path); sorted() makes the command order reproducible between runs.
impute_genos_list = [os.path.splitext(x)[0] for x in sorted(glob.glob(f'{out_dir}/*.bed'))]

# The ancestry label is taken to be the text after the last underscore of the file name.
impute_labels_list = [os.path.basename(x).split('_')[-1] for x in impute_genos_list]

with open(f'{swarm_scripts_dir}/run_imputation_pipeline.swarm','w') as f:
    for geno, label in zip(impute_genos_list, impute_labels_list):

        label_temp_outdir = f'{basedir}/shulman_ny/plink/{label}'
        label_temp = f'{label_temp_outdir}/{label}'
        label_outdir = f'{imputed_out_dir}/{label}'
        os.makedirs(f'{label_outdir}', exist_ok=True)
        os.makedirs(f'{label_temp_outdir}', exist_ok=True)
        # NOTE(review): presumably writes the imputation-ready files at `label_temp` - confirm in QC.imputation.
        impute_data = impute_data_prep(geno, label_temp, ref_panel, check_bim_pl)

        cmd_head = f'python3 ../run_imputation_pipeline.py --geno {label_temp} --token '
        cmd_tail = f' --out {label_outdir}/'
        run_pipeline = f'{cmd_head}{config.api_key}{cmd_tail}'
        f.write(f'{run_pipeline}\n')

        # Show the command with the API token masked, so the secret never lands in the
        # saved notebook output (the swarm file itself still needs the real token).
        print(f'{cmd_head}<REDACTED>{cmd_tail}')

python3 ../run_imputation_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/shulman_ny/plink/AMR/AMR --token <REDACTED> --out /data/CARD/PD/GP2/genotypes/shulman_ny/imputed/AMR/
python3 ../run_imputation_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/shulman_ny/plink/AJ/AJ --token <REDACTED> --out /data/CARD/PD/GP2/genotypes/shulman_ny/imputed/AJ/
python3 ../run_imputation_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/shulman_ny/plink/EAS/EAS --token <REDACTED>

In [None]:
# Submit the imputation pipeline swarm file (requests an 80-hour time limit via --time).
!swarm -f {swarm_scripts_dir}/run_imputation_pipeline.swarm -g 20 -t 16 --time=80:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

In [11]:
# Unzip the imputed results: write one unzip command per ancestry label and chromosome.
# (Conversion to plink2 format happens in a later cell.)
imputed_out_dir = f'{out_genotypes}/shulman_ny/imputed'

# Chromosomes 1-22 plus X, as strings for building file names.
chroms = [str(x) for x in range(1,23)] + ['X']

with open(f'{swarm_scripts_dir}/unzip_imputed.swarm', 'w') as f:
    for label in labels:
        zipdir = f'{imputed_out_dir}/{label}'
        for chrom in chroms:
            zip_name = f'chr_{chrom}.zip'
            # `&&` rather than `;`: if the cd fails, unzip must not run in the wrong directory.
            unzip_cmd = f'cd {zipdir} && unzip -P imputer {zip_name}'
            f.write(f'{unzip_cmd}\n')


In [39]:
# Submit the unzip swarm file; the number printed under the cell is the command's output (presumably the job id).
!swarm -f {swarm_scripts_dir}/unzip_imputed.swarm -g 20 -t 16 --time=80:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

18612759


In [12]:
# Write one plink2 VCF -> pgen conversion command per ancestry label and chromosome.
with open(f'{swarm_scripts_dir}/make_pgen.swarm', 'w') as f:
    for label in labels:
        imputed_label_dir = f'{out_genotypes}/shulman_ny/imputed/{label}'
        for chrom in chroms:
            # The output prefix and the input VCF share the same chr<N> stem.
            pgen_out = f'{imputed_label_dir}/chr{chrom}'
            vcf_in = f'{pgen_out}.dose.vcf.gz'

            make_pgen_cmd = ' '.join([
                'plink2',
                '--vcf', vcf_in, "'dosage=HDS'",
                '--make-pgen',
                '--out', pgen_out,
            ])

            f.write(make_pgen_cmd + '\n')
            


In [17]:
# Submit the make_pgen swarm file with the plink/2.3-alpha module loaded for the jobs.
!swarm -f {swarm_scripts_dir}/make_pgen.swarm -g 32 -t 32 --time=10:00:00 --logdir swarm --module plink/2.3-alpha --gres=lscratch:20 --partition=norm

18658132


In [13]:
# Restore sample IDs, sex and phenotype on the imputed pgen files.
# For every label/chromosome this writes three helper files next to the pgen
# (.update_ids, .sex, .pheno) and one two-step plink2 command to the swarm file.

# Use the merged pre-QC fileset explicitly rather than whatever `geno_path` currently
# holds: the split-by-ancestry cell reassigns it to the last label's cleaned fileset,
# which would leave every other label without sex/phenotype values.
fam_path = f'{basedir}/shulman_ny/plink/shulman.fam'
# dtype=str keeps IDs as text (so the merge keys match) and writes sex/pheno codes back unchanged.
fam = pd.read_csv(fam_path, sep=r'\s+', header=None, names=['FID','IID','PAT','MAT','sex','pheno'], dtype=str)

with open(f'{swarm_scripts_dir}/pgen_update_info.swarm', 'w') as f:

    for label in labels:
        basepath = f'{out_genotypes}/shulman_ny/imputed/{label}'
        for chrom in chroms:
            pgen_path = f'{basepath}/chr{chrom}'
            pgen_out = f'{basepath}/shulman_ny_{label}_chr{chrom}'

            psam = pd.read_csv(f'{pgen_path}.psam', sep=r'\s+', dtype=str)
            psam = psam.rename(columns={'#IID':'IID'})
            # Strip only the leading '0_' prefix; an unanchored replace would also
            # remove any '0_' occurring inside an ID.
            psam['IID2'] = psam.IID.str.replace(r'^0_', '', regex=True)
            psam.loc[:,['IID','IID2']].to_csv(f'{pgen_path}.update_ids', sep='\t', header=False, index=False)

            # Both frames have an IID column, so after the merge fam's copy is named IID_y.
            psam_fam = psam.merge(fam, how='left', left_on='IID2', right_on='IID')
            psam_fam[['FID', 'IID_y', 'sex']].to_csv(f'{pgen_path}.sex', sep='\t', header=False, index=False)
            psam_fam[['FID', 'IID_y', 'pheno']].to_csv(f'{pgen_path}.pheno', sep='\t', header=False, index=False)

            # Step 1: rename the samples into a temporary fileset.
            plink_cmd1 = f"\
plink2 --pfile {pgen_path} \
--update-ids {pgen_path}.update_ids \
--make-pgen \
--out {pgen_path}_tmp"

            # Step 2: add sex and phenotype to the renamed fileset.
            plink_cmd2 = f"\
plink2 --pfile {pgen_path}_tmp \
--update-sex {pgen_path}.sex \
--pheno {pgen_path}.pheno \
--make-pgen \
--out {pgen_out}"

            # `&&` so step 2 only runs when step 1 succeeded.
            f.write(f'{plink_cmd1} && {plink_cmd2}\n')


In [44]:
# Submit the pgen_update_info swarm file with the plink/2.3-alpha module loaded for the jobs.
!swarm -f {swarm_scripts_dir}/pgen_update_info.swarm -g 32 -t 32 --time=10:00:00 --logdir swarm --module plink/2.3-alpha --gres=lscratch:20 --partition=norm

18706935


In [14]:
# Split the imputed pgen files by site: for each label/chromosome, write a keep-list
# per cohort (BCM, UMD) and the plink2 command that extracts those samples.
with open(f'{swarm_scripts_dir}/split_cohort_pgen.swarm', 'w') as f: 
    for label in labels:
        basepath = f'{out_genotypes}/shulman_ny/imputed/{label}'

        # Output directories depend only on the label, so create them once per label
        # instead of once per chromosome.
        cohort_label_paths = {
            cohort: f'{out_genotypes}/{cohort}/imputed/{label}' for cohort in ('bcm', 'umd')
        }
        for cohort_label_path in cohort_label_paths.values():
            os.makedirs(cohort_label_path, exist_ok=True)

        for chrom in chroms:
            pgen_path = f'{basepath}/shulman_ny_{label}_chr{chrom}'
            # Raw string: '\s' in a normal string literal is an invalid escape sequence.
            # NOTE(review): assumes the .psam header starts with '#IID' - confirm for these files.
            psam = pd.read_csv(f'{pgen_path}.psam', sep=r'\s+')
            psam = psam.rename(columns={'#IID':'IID'})
            psam['IID'] = psam['IID'].astype(str)

            # Same order as before: BCM command first, then UMD.
            for cohort, cohort_pheno in (('bcm', bcm_pheno), ('umd', umd_pheno)):
                cohort_out = f'{cohort_label_paths[cohort]}/{cohort}_{label}_chr{chrom}'
                cohort_samples = psam.merge(cohort_pheno, how='inner', left_on='IID', right_on='Original_clinicalID')

                cohort_samples[['IID']].to_csv(f'{cohort_out}.samples', header=False, index=False, sep='\t')
                f.write(f'plink2 --pfile {pgen_path} --keep {cohort_out}.samples --make-pgen --out {cohort_out}\n')


In [20]:
# Submit the split_cohort_pgen swarm file with the plink/2.3-alpha module loaded for the jobs.
!swarm -f {swarm_scripts_dir}/split_cohort_pgen.swarm -g 8 -t 8 --time=10:00:00 --logdir swarm --module plink/2.3-alpha --gres=lscratch:20 --partition=norm

18769913


In [31]:
for label in labels:
    print(label)
    print('BCM:')
    !cat {out_genotypes}/bcm/imputed/{label}/bcm_{label}_chrX.pvar | wc -l
    print('UMD:')
    !cat {out_genotypes}/umd/imputed/{label}/umd_{label}_chrX.pvar | wc -l
    print()

EUR
BCM:
13863122
UMD:
13863122

AMR
BCM:
15331055
UMD:
15331055

SAS
BCM:
9794453
UMD:
9794453

AJ
BCM:
13762228
UMD:
13762228

AAC
BCM:
15331055
UMD:
15331055

EAS
BCM:
4993566
UMD:
4993566



In [30]:
# Print the BCM imputed output directory for each ancestry label (one shell echo per label).
for label in labels:
    !echo {out_genotypes}/bcm/imputed/{label}/

/data/CARD/PD/GP2/genotypes/bcm/imputed/EUR/
/data/CARD/PD/GP2/genotypes/bcm/imputed/AMR/
/data/CARD/PD/GP2/genotypes/bcm/imputed/SAS/
/data/CARD/PD/GP2/genotypes/bcm/imputed/AJ/
/data/CARD/PD/GP2/genotypes/bcm/imputed/AAC/
/data/CARD/PD/GP2/genotypes/bcm/imputed/EAS/
