In [1]:
import pandas as pd
import os
import shutil
import numpy as np
import glob
import subprocess

from QC.utils import shell_do
import QC.config as config
from QC.imputation import impute_data_prep

In [2]:
# ---- root locations --------------------------------------------------------
basedir = '/data/CARD/PD/GP2/raw_genotypes'
out_genotypes = '/data/CARD/PD/GP2/genotypes'
ilmn_files_path = '/data/CARD/PD/GP2/ilmn_files'

# ---- cohort-specific inputs -------------------------------------------------
cohort_path = f'{basedir}/coriell'
sample_info_path = f'{cohort_path}/sample_info'
key_file = f'{sample_info_path}/KEY_FILE_GP2_Coriell_P1_25_042621_B1.txt'
pheno_file = f'{sample_info_path}/CORIELL_20210201_clinical.csv'

# ---- Illumina support files handed to `iaap-cli gencall` further down --------
bpm = f'{ilmn_files_path}/NeuroBooster_20042459_A1.bpm'
egt = f'{ilmn_files_path}/NBSCluster_file_n1393_011921.egt'
iaap = f'{ilmn_files_path}/iaap-cli/iaap-cli'

# ---- working directories for this cohort ------------------------------------
ped_dir = f'{cohort_path}/ped'
bed_dir = f'{cohort_path}/bed'
swarm_scripts_dir = f'{cohort_path}/swarm_scripts'
idat_dir = f'{cohort_path}/idats'
raw_idat_file_path = f'{cohort_path}/idats/GP2_coriell_idats'

# Create every output directory up front; exist_ok makes the cell safe to re-run.
for required_dir in (
    ped_dir,
    bed_dir,
    f'{bed_dir}/indiv_samples',
    swarm_scripts_dir,
    idat_dir,
):
    os.makedirs(required_dir, exist_ok=True)

In [5]:
# Load the clinical table (CSV) and the array key file (tab-separated).
pheno = pd.read_csv(pheno_file)
key = pd.read_csv(key_file, sep='\t')

# Cast both join keys to str so the merge compares like with like.
pheno['Original_clinicalID'] = pheno['Original_clinicalID'].astype(str)
key['Sample_ID'] = key['Sample_ID'].astype(str)

# "<barcode>_<position>" is the stem used for each sample's idat/ped/bed files below.
key['filename'] = key['SentrixBarcode_A'].astype(str) + '_' + key['SentrixPosition_A']

# Inner join: key rows with no matching clinical record are dropped here and
# therefore never appear in update_ids.txt / update_pheno.txt.
pheno_out = key.merge(pheno, how='inner', left_on='Sample_ID', right_on='Original_clinicalID')
pheno_out['IID'] = pheno_out.SentrixBarcode_A.astype(str) + '_' + pheno_out.SentrixPosition_A.astype(str)
pheno_out['FID'] = 0
pheno_out['FID_new'] = 0

# Case/control coding: PD -> 2, Control -> 1, anything else (including missing) -> 0.
# The previous `Phenotype == np.nan` test could never match (NaN != NaN); mapping and
# then filling the unmapped rows makes the "missing -> 0" rule explicit.
pheno_out['pheno'] = (
    pheno_out['Phenotype']
    .map({'PD': 2, 'Control': 1})
    .fillna(0)
    .astype(int)
)

# Headerless, tab-separated files consumed by the plink --update-ids / --pheno steps later on.
pheno_out[['FID','IID', 'FID_new', 'Sample_ID']].to_csv(f'{sample_info_path}/update_ids.txt', sep='\t', header=False, index=False)
pheno_out[['FID_new', 'Sample_ID', 'pheno']].to_csv(f'{sample_info_path}/update_pheno.txt', sep='\t', header=False, index=False)

In [4]:
# One sub-directory per array barcode under idat_dir.
for code in key.SentrixBarcode_A.unique():
    os.makedirs(f'{idat_dir}/{code}', exist_ok=True)

# Copy each sample's green and red intensity files into its barcode directory,
# remembering every source path that could not be found.
missing_idats = []

for barcode, filename in zip(key.SentrixBarcode_A, key.filename):
    for channel in ('Grn', 'Red'):
        idat_name = f'{filename}_{channel}.idat'
        src_idat = f'{raw_idat_file_path}/{idat_name}'

        if not os.path.isfile(src_idat):
            missing_idats.append(src_idat)
            continue

        shutil.copyfile(src=src_idat, dst=f'{idat_dir}/{barcode}/{idat_name}')

# Displayed as the cell result: number of idat files that were not found.
len(missing_idats)

KeyboardInterrupt: 

In [None]:
# Write one `iaap-cli gencall` command per array barcode into idat_to_ped.swarm.
# Each line has the form:
#   <iaap> gencall <bpm> <egt> <ped_dir>/ -f <idat_dir>/<barcode> -p -t 8
# NOTE(review): -f presumably names the idat folder, -p requests PED output and
# -t the thread count -- confirm against the iaap-cli help.
with open(f'{swarm_scripts_dir}/idat_to_ped.swarm', 'w') as f:
    for code in key.SentrixBarcode_A.unique():
        idat_to_ped_cmd = ' '.join([
            iaap, 'gencall',
            bpm,
            egt,
            f'{ped_dir}/',
            f'-f {idat_dir}/{code}',
            '-p',
            '-t 8',
        ])
        f.write(f'{idat_to_ped_cmd}\n')
# The with-block closes the file on exit, so no explicit f.close() is needed.


In [None]:
# Submit the commands in idat_to_ped.swarm through the `swarm` launcher; job logs go to <swarm_scripts_dir>/logs.
# NOTE(review): -g / -t are presumably GB of memory and threads per command -- confirm against the swarm docs.
# The gencall commands in the swarm file pass `-t 8`, while 16 threads are requested here.
!swarm -f {swarm_scripts_dir}/idat_to_ped.swarm -g 32 -t 16 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

In [58]:
# Every ped needs a map with the same stem: duplicate the single
# NeuroBooster map next to each ped that actually exists.
map_file = f'{ped_dir}/NeuroBooster_20042459_A1.map'
for filename in key.filename:
    sample_ped = f'{ped_dir}/{filename}.ped'
    sample_map = f'{ped_dir}/{filename}.map'

    # No ped for this sample: report it and leave the map uncreated.
    if not os.path.isfile(sample_ped):
        print(f'{sample_ped} does not exist!')
        print(f'{sample_map} creation cancelled')
        continue

    shutil.copyfile(src=map_file, dst=sample_map)

In [7]:

# Write one plink ped -> bed conversion command per sample into make_bed.swarm.
# Each line has the form:
#   plink --file <ped_dir>/<sample> --make-bed --out <bed_dir>/indiv_samples/<sample>
# A command is written for every row of the key file, whether or not its ped exists.
with open(f'{swarm_scripts_dir}/make_bed.swarm', 'w') as f:
    for filename in key.filename:
        ped = f'{ped_dir}/{filename}'
        make_bed_cmd = ' '.join([
            'plink',
            f'--file {ped}',
            '--make-bed',
            f'--out {bed_dir}/indiv_samples/{filename}',
        ])
        f.write(f'{make_bed_cmd}\n')
# The with-block closes the file on exit, so no explicit f.close() is needed.


In [8]:
# Submit the per-sample ped -> bed conversions in make_bed.swarm through the `swarm` launcher.
# NOTE(review): --logdir is the relative path `swarm` here, unlike the idat_to_ped submission,
# which logs under {swarm_scripts_dir}/logs -- confirm this is intended.
!swarm -f {swarm_scripts_dir}/make_bed.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

17032585


In [9]:
# List the per-sample bed prefixes (one per line) for plink --merge-list.
with open(f"{bed_dir}/merge_bed.list", 'w') as f:
    for filename in key.filename:
        f.write(f'{bed_dir}/indiv_samples/{filename}\n')

# Single plink command that merges every listed fileset, applies the ID changes
# in update_ids.txt (columns written earlier as FID, IID, FID_new, Sample_ID)
# and writes the merged bed/bim/fam as indiv_samples/coriell_merge.
plink_merge_cmd = ' '.join([
    'plink',
    f'--merge-list {bed_dir}/merge_bed.list',
    f'--update-ids {sample_info_path}/update_ids.txt',
    '--make-bed',
    f'--out {bed_dir}/indiv_samples/coriell_merge',
])

with open(f"{swarm_scripts_dir}/merge.swarm", 'w') as f:
    # Terminate the line with a newline, as every other swarm file in this notebook does.
    f.write(f"{plink_merge_cmd}\n")
# Both with-blocks close their files on exit, so no explicit f.close() is needed.

In [10]:
# Submit the single plink merge command in merge.swarm through the `swarm` launcher.
# Logs go to the relative directory `swarm` (resolved against the notebook's working directory).
!swarm -f {swarm_scripts_dir}/merge.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

17036332


In [11]:
# Run plink locally (not via swarm): read the merged fileset, attach the phenotypes from
# update_pheno.txt (columns written earlier as FID_new, Sample_ID, pheno) and write {bed_dir}/coriell.
!plink --bfile {bed_dir}/indiv_samples/coriell_merge --pheno {sample_info_path}/update_pheno.txt --make-bed --out {bed_dir}/coriell

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /data/CARD/PD/GP2/raw_genotypes/coriell/bed/coriell.log.
Options in effect:
  --bfile /data/CARD/PD/GP2/raw_genotypes/coriell/bed/indiv_samples/coriell_merge
  --make-bed
  --out /data/CARD/PD/GP2/raw_genotypes/coriell/bed/coriell
  --pheno /data/CARD/PD/GP2/raw_genotypes/coriell/sample_info/update_pheno.txt

1547809 MB RAM detected; reserving 773904 MB for main workspace.
2004347 variants loaded from .bim file.
2304 people (1307 males, 941 females, 56 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
/data/CARD/PD/GP2/raw_genotypes/coriell/bed/coriell.nosex .
2304 phenotype values present after --pheno.
phenotypes to be ignored, use the --allow-no-sex flag.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 2304 founders and 0 nonfounders present.
Calculating allele frequencies

In [3]:
# run QC pipeline
geno_path = f'{basedir}/coriell/bed/coriell'
out_dir = f'{out_genotypes}/coriell/clean'
out_path = f'{out_dir}/coriell'
os.makedirs(f'{out_dir}', exist_ok=True)
ref_dir_path = '/data/LNG/vitaled2/1kgenomes'
ref_panel = f'{ref_dir_path}/1kg_ashkj_ref_panel_gp2_pruned'
ref_labels = f'{ref_dir_path}/ref_panel_ancestry.txt'


with open(f'{swarm_scripts_dir}/run_qc_pipeline.swarm','w') as f:
    run_pipeline = f'python3 ../run_qc_pipeline.py --geno {geno_path} --ref {ref_panel} --ref_labels {ref_labels} --out {out_path}'
    f.write(f'{run_pipeline}\n')
f.close()
!cat {swarm_scripts_dir}/run_qc_pipeline.swarm

python3 ../run_qc_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/coriell/bed/coriell --ref /data/LNG/vitaled2/1kgenomes/1kg_ashkj_ref_panel_gp2_pruned --ref_labels /data/LNG/vitaled2/1kgenomes/ref_panel_ancestry.txt --out /data/CARD/PD/GP2/genotypes/coriell/clean/coriell


In [4]:
# !swarm -f {swarm_scripts_dir}/run_qc_pipeline.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

In [13]:
# run imputation

# these files need to be in a different place eventually
ref_panel='/data/vitaled2/GenoTools/ref/PASS.Variantsbravo-dbsnp-all.tab'
check_bim_pl = '/data/vitaled2/GenoTools/ref/HRC-1000G-check-bim.pl'

imputed_out_dir = f'{out_genotypes}/coriell/imputed'
os.makedirs(imputed_out_dir, exist_ok=True)

# jenky method for pulling cleaned genos with ancestry labels for imputation... should figure out how to do this better later
# Each *.bed in out_dir becomes one imputation input. The prefix is the path without
# its extension (splitext, so a dot elsewhere in the path cannot truncate it), and the
# label is whatever follows the last "_" in the file name (e.g. coriell_EUR -> EUR).
# Sorted so the swarm file is written in a reproducible order.
# NOTE(review): any other .bed left in out_dir would be picked up as well.
impute_genos_list = sorted(os.path.splitext(bed)[0] for bed in glob.glob(f'{out_dir}/*.bed'))
impute_labels_list = [os.path.basename(geno).split('_')[-1] for geno in impute_genos_list]

with open(f'{swarm_scripts_dir}/run_imputation_pipeline.swarm','w') as f:
    for geno, label in zip(impute_genos_list, impute_labels_list):

        label_temp_outdir = f'{basedir}/coriell/bed/{label}'
        label_temp = f'{label_temp_outdir}/{label}'
        label_outdir = f'{imputed_out_dir}/{label}'
        os.makedirs(label_outdir, exist_ok=True)
        os.makedirs(label_temp_outdir, exist_ok=True)

        # Runs immediately in the notebook kernel, not on the cluster. Judging by this
        # cell's recorded output it shells out to plink and the check-bim perl script.
        impute_data = impute_data_prep(geno, label_temp, ref_panel, check_bim_pl)

        # NOTE(security): the imputation API token is written in plaintext into the swarm
        # file; keep that file out of version control and shared locations.
        run_pipeline = f'python3 ../run_imputation_pipeline.py --geno {label_temp} --token {config.api_key} --out {label_outdir}/'
        # run_pipeline = f'python3 ../run_imputation_pipeline.py --geno {geno} --temp {label_temp} --token {config.api_key} --ref_panel {ref_panel} --check_bim_pl {check_bim_pl} --out {label_outdir}/'
        f.write(f'{run_pipeline}\n')
# The with-block closes the file on exit, so no explicit f.close() is needed.


    
# !cat {swarm_scripts_dir}/run_imputation_pipeline.swarm

Executing: plink --bfile coriell_EAS --freq --out EAS


PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to EAS.log.
Options in effect:
  --bfile coriell_EAS
  --freq
  --out EAS

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1946227 variants loaded from .bim file.
14 people (11 males, 3 females) loaded from .fam.
14 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 14 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
--freq: Allele frequencies (founders only) written to EAS.frq .



Executing: perl HRC-1000G-check-bim.pl -b coriell_EAS.bim -f EAS.frq -r PASS.Variantsbravo-dbsnp-all.tab -h




         Script to check plink .bim files against HRC/1000G for
        strand, id names, positions, alleles, ref/alt assignment
                         William Rayner 2015
                        wrayner@well.ox.ac.uk

                             Version 4.2.5


Options Set:
Reference Panel:             HRC
Bim filename:                coriell_EAS.bim
Reference filename:          PASS.Variantsbravo-dbsnp-all.tab
Allele frequencies filename: EAS.frq
Allele frequency threshold:  0.2


Reading PASS.Variantsbravo-dbsnp-all.tab
 100000 200000 300000 400000 500000 600000 700000 800000 900000 1000000 1100000 1200000 1300000 1400000 1500000 1600000 1700000 1800000 1900000 2000000 2100000 2200000 2300000 2400000 2500000 2600000 2700000 2800000 2900000 3000000 3100000 3200000 3300000 3400000 3500000 3600000 3700000 3800000 3900000 4000000 4100000 4200000 4300000 4400000 4500000 4600000 4700000 4800000 4900000 5000000 5100000 5200000 5300000 5400000 5500000 5600000 5700000 5800000 5900000 60

Executing: sh Run-plink.sh


PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to TEMP1.log.
Options in effect:
  --bfile coriell_EAS
  --exclude Exclude-coriell_EAS-HRC.txt
  --make-bed
  --out TEMP1

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1946227 variants loaded from .bim file.
14 people (11 males, 3 females) loaded from .fam.
14 phenotype values loaded from .fam.
--exclude: 1108512 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 14 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
1108512 variants and 14 people pass filters and QC.
Among remaining phenotypes, 9 are cases and 5 are controls.
--make-bed to TEMP1.bed

Executing: plink --bfile coriell_EAS-updated-chr1 --recode vcf --chr 1 --out EAS_chr1
Executing: plink --bfile coriell_EAS-updated-chr2 --recode vcf --chr 2 --out EAS_chr2
Executing: plink --bfile coriell_EAS-updated-chr3 --recode vcf --chr 3 --out EAS_chr3
Executing: plink --bfile coriell_EAS-updated-chr4 --recode vcf --chr 4 --out EAS_chr4
Executing: plink --bfile coriell_EAS-updated-chr5 --recode vcf --chr 5 --out EAS_chr5
Executing: plink --bfile coriell_EAS-updated-chr6 --recode vcf --chr 6 --out EAS_chr6
Executing: plink --bfile coriell_EAS-updated-chr7 --recode vcf --chr 7 --out EAS_chr7
Executing: plink --bfile coriell_EAS-updated-chr8 --recode vcf --chr 8 --out EAS_chr8
Executing: plink --bfile coriell_EAS-updated-chr9 --recode vcf --chr 9 --out EAS_chr9
Executing: plink --bfile coriell_EAS-updated-chr10 --recode vcf --chr 10 --out EAS_chr10
Executing: plink --bfile coriell_EAS-updated-chr11 --recode vcf --chr 11 --out EAS_chr11
Executing: plink --bfile coriell_EAS-updated-chr

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AJ.log.
Options in effect:
  --bfile coriell_AJ
  --freq
  --out AJ

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1959435 variants loaded from .bim file.
262 people (165 males, 97 females) loaded from .fam.
262 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 262 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998536.
--freq: Allele frequencies (founders only) written to AJ.frq .



Executing: perl HRC-1000G-check-bim.pl -b coriell_AJ.bim -f AJ.frq -r PASS.Variantsbravo-dbsnp-all.tab -h




         Script to check plink .bim files against HRC/1000G for
        strand, id names, positions, alleles, ref/alt assignment
                         William Rayner 2015
                        wrayner@well.ox.ac.uk

                             Version 4.2.5


Options Set:
Reference Panel:             HRC
Bim filename:                coriell_AJ.bim
Reference filename:          PASS.Variantsbravo-dbsnp-all.tab
Allele frequencies filename: AJ.frq
Allele frequency threshold:  0.2


Reading PASS.Variantsbravo-dbsnp-all.tab
 100000 200000 300000 400000 500000 600000 700000 800000 900000 1000000 1100000 1200000 1300000 1400000 1500000 1600000 1700000 1800000 1900000 2000000 2100000 2200000 2300000 2400000 2500000 2600000 2700000 2800000 2900000 3000000 3100000 3200000 3300000 3400000 3500000 3600000 3700000 3800000 3900000 4000000 4100000 4200000 4300000 4400000 4500000 4600000 4700000 4800000 4900000 5000000 5100000 5200000 5300000 5400000 5500000 5600000 5700000 5800000 5900000 6000

Executing: sh Run-plink.sh


PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to TEMP1.log.
Options in effect:
  --bfile coriell_AJ
  --exclude Exclude-coriell_AJ-HRC.txt
  --make-bed
  --out TEMP1

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1959435 variants loaded from .bim file.
262 people (165 males, 97 females) loaded from .fam.
262 phenotype values loaded from .fam.
--exclude: 1251473 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 262 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998626.
1251473 variants and 262 people pass filters and QC.
Among remaining phenotypes, 196 are cases an

Executing: plink --bfile coriell_AJ-updated-chr1 --recode vcf --chr 1 --out AJ_chr1
Executing: plink --bfile coriell_AJ-updated-chr2 --recode vcf --chr 2 --out AJ_chr2
Executing: plink --bfile coriell_AJ-updated-chr3 --recode vcf --chr 3 --out AJ_chr3
Executing: plink --bfile coriell_AJ-updated-chr4 --recode vcf --chr 4 --out AJ_chr4
Executing: plink --bfile coriell_AJ-updated-chr5 --recode vcf --chr 5 --out AJ_chr5
Executing: plink --bfile coriell_AJ-updated-chr6 --recode vcf --chr 6 --out AJ_chr6
Executing: plink --bfile coriell_AJ-updated-chr7 --recode vcf --chr 7 --out AJ_chr7
Executing: plink --bfile coriell_AJ-updated-chr8 --recode vcf --chr 8 --out AJ_chr8
Executing: plink --bfile coriell_AJ-updated-chr9 --recode vcf --chr 9 --out AJ_chr9
Executing: plink --bfile coriell_AJ-updated-chr10 --recode vcf --chr 10 --out AJ_chr10
Executing: plink --bfile coriell_AJ-updated-chr11 --recode vcf --chr 11 --out AJ_chr11
Executing: plink --bfile coriell_AJ-updated-chr12 --recode vcf --chr 1

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AMR.log.
Options in effect:
  --bfile coriell_AMR
  --freq
  --out AMR

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1962630 variants loaded from .bim file.
91 people (48 males, 43 females) loaded from .fam.
91 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 91 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998781.
--freq: Allele frequencies (founders only) written to AMR.frq .



Executing: perl HRC-1000G-check-bim.pl -b coriell_AMR.bim -f AMR.frq -r PASS.Variantsbravo-dbsnp-all.tab -h




         Script to check plink .bim files against HRC/1000G for
        strand, id names, positions, alleles, ref/alt assignment
                         William Rayner 2015
                        wrayner@well.ox.ac.uk

                             Version 4.2.5


Options Set:
Reference Panel:             HRC
Bim filename:                coriell_AMR.bim
Reference filename:          PASS.Variantsbravo-dbsnp-all.tab
Allele frequencies filename: AMR.frq
Allele frequency threshold:  0.2


Reading PASS.Variantsbravo-dbsnp-all.tab
 100000 200000 300000 400000 500000 600000 700000 800000 900000 1000000 1100000 1200000 1300000 1400000 1500000 1600000 1700000 1800000 1900000 2000000 2100000 2200000 2300000 2400000 2500000 2600000 2700000 2800000 2900000 3000000 3100000 3200000 3300000 3400000 3500000 3600000 3700000 3800000 3900000 4000000 4100000 4200000 4300000 4400000 4500000 4600000 4700000 4800000 4900000 5000000 5100000 5200000 5300000 5400000 5500000 5600000 5700000 5800000 5900000 60

Executing: sh Run-plink.sh


PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to TEMP1.log.
Options in effect:
  --bfile coriell_AMR
  --exclude Exclude-coriell_AMR-HRC.txt
  --make-bed
  --out TEMP1

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1962630 variants loaded from .bim file.
91 people (48 males, 43 females) loaded from .fam.
91 phenotype values loaded from .fam.
--exclude: 1266897 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 91 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998853.
1266897 variants and 91 people pass filters and QC.
Among remaining phenotypes, 71 are cases and 20

Executing: plink --bfile coriell_AMR-updated-chr1 --recode vcf --chr 1 --out AMR_chr1
Executing: plink --bfile coriell_AMR-updated-chr2 --recode vcf --chr 2 --out AMR_chr2
Executing: plink --bfile coriell_AMR-updated-chr3 --recode vcf --chr 3 --out AMR_chr3
Executing: plink --bfile coriell_AMR-updated-chr4 --recode vcf --chr 4 --out AMR_chr4
Executing: plink --bfile coriell_AMR-updated-chr5 --recode vcf --chr 5 --out AMR_chr5
Executing: plink --bfile coriell_AMR-updated-chr6 --recode vcf --chr 6 --out AMR_chr6
Executing: plink --bfile coriell_AMR-updated-chr7 --recode vcf --chr 7 --out AMR_chr7
Executing: plink --bfile coriell_AMR-updated-chr8 --recode vcf --chr 8 --out AMR_chr8
Executing: plink --bfile coriell_AMR-updated-chr9 --recode vcf --chr 9 --out AMR_chr9
Executing: plink --bfile coriell_AMR-updated-chr10 --recode vcf --chr 10 --out AMR_chr10
Executing: plink --bfile coriell_AMR-updated-chr11 --recode vcf --chr 11 --out AMR_chr11
Executing: plink --bfile coriell_AMR-updated-chr

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to EUR.log.
Options in effect:
  --bfile coriell_EUR
  --freq
  --out EUR

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1932853 variants loaded from .bim file.
1740 people (1008 males, 732 females) loaded from .fam.
1740 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 1740 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998642.
--freq: Allele frequencies (founders only) written to EUR.frq .



Executing: perl HRC-1000G-check-bim.pl -b coriell_EUR.bim -f EUR.frq -r PASS.Variantsbravo-dbsnp-all.tab -h




         Script to check plink .bim files against HRC/1000G for
        strand, id names, positions, alleles, ref/alt assignment
                         William Rayner 2015
                        wrayner@well.ox.ac.uk

                             Version 4.2.5


Options Set:
Reference Panel:             HRC
Bim filename:                coriell_EUR.bim
Reference filename:          PASS.Variantsbravo-dbsnp-all.tab
Allele frequencies filename: EUR.frq
Allele frequency threshold:  0.2


Reading PASS.Variantsbravo-dbsnp-all.tab
 100000 200000 300000 400000 500000 600000 700000 800000 900000 1000000 1100000 1200000 1300000 1400000 1500000 1600000 1700000 1800000 1900000 2000000 2100000 2200000 2300000 2400000 2500000 2600000 2700000 2800000 2900000 3000000 3100000 3200000 3300000 3400000 3500000 3600000 3700000 3800000 3900000 4000000 4100000 4200000 4300000 4400000 4500000 4600000 4700000 4800000 4900000 5000000 5100000 5200000 5300000 5400000 5500000 5600000 5700000 5800000 5900000 60

Executing: sh Run-plink.sh


PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to TEMP1.log.
Options in effect:
  --bfile coriell_EUR
  --exclude Exclude-coriell_EUR-HRC.txt
  --make-bed
  --out TEMP1

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1932853 variants loaded from .bim file.
1740 people (1008 males, 732 females) loaded from .fam.
1740 phenotype values loaded from .fam.
--exclude: 1235073 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 1740 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998709.
1235073 variants and 1740 people pass filters and QC.
Among remaining phenotypes, 1090 are

Executing: plink --bfile coriell_EUR-updated-chr1 --recode vcf --chr 1 --out EUR_chr1
Executing: plink --bfile coriell_EUR-updated-chr2 --recode vcf --chr 2 --out EUR_chr2
Executing: plink --bfile coriell_EUR-updated-chr3 --recode vcf --chr 3 --out EUR_chr3
Executing: plink --bfile coriell_EUR-updated-chr4 --recode vcf --chr 4 --out EUR_chr4
Executing: plink --bfile coriell_EUR-updated-chr5 --recode vcf --chr 5 --out EUR_chr5
Executing: plink --bfile coriell_EUR-updated-chr6 --recode vcf --chr 6 --out EUR_chr6
Executing: plink --bfile coriell_EUR-updated-chr7 --recode vcf --chr 7 --out EUR_chr7
Executing: plink --bfile coriell_EUR-updated-chr8 --recode vcf --chr 8 --out EUR_chr8
Executing: plink --bfile coriell_EUR-updated-chr9 --recode vcf --chr 9 --out EUR_chr9
Executing: plink --bfile coriell_EUR-updated-chr10 --recode vcf --chr 10 --out EUR_chr10
Executing: plink --bfile coriell_EUR-updated-chr11 --recode vcf --chr 11 --out EUR_chr11
Executing: plink --bfile coriell_EUR-updated-chr

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to SAS.log.
Options in effect:
  --bfile coriell_SAS
  --freq
  --out SAS

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1950640 variants loaded from .bim file.
14 people (10 males, 4 females) loaded from .fam.
14 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 14 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
--freq: Allele frequencies (founders only) written to SAS.frq .



Executing: perl HRC-1000G-check-bim.pl -b coriell_SAS.bim -f SAS.frq -r PASS.Variantsbravo-dbsnp-all.tab -h




         Script to check plink .bim files against HRC/1000G for
        strand, id names, positions, alleles, ref/alt assignment
                         William Rayner 2015
                        wrayner@well.ox.ac.uk

                             Version 4.2.5


Options Set:
Reference Panel:             HRC
Bim filename:                coriell_SAS.bim
Reference filename:          PASS.Variantsbravo-dbsnp-all.tab
Allele frequencies filename: SAS.frq
Allele frequency threshold:  0.2


Reading PASS.Variantsbravo-dbsnp-all.tab
 100000 200000 300000 400000 500000 600000 700000 800000 900000 1000000 1100000 1200000 1300000 1400000 1500000 1600000 1700000 1800000 1900000 2000000 2100000 2200000 2300000 2400000 2500000 2600000 2700000 2800000 2900000 3000000 3100000 3200000 3300000 3400000 3500000 3600000 3700000 3800000 3900000 4000000 4100000 4200000 4300000 4400000 4500000 4600000 4700000 4800000 4900000 5000000 5100000 5200000 5300000 5400000 5500000 5600000 5700000 5800000 5900000 60

Executing: sh Run-plink.sh


PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to TEMP1.log.
Options in effect:
  --bfile coriell_SAS
  --exclude Exclude-coriell_SAS-HRC.txt
  --make-bed
  --out TEMP1

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1950640 variants loaded from .bim file.
14 people (10 males, 4 females) loaded from .fam.
14 phenotype values loaded from .fam.
--exclude: 1200829 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 14 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
1200829 variants and 14 people pass filters and QC.
Among remaining phenotypes, 12 are cases and 2 are controls.
--make-bed to TEMP1.be

Executing: plink --bfile coriell_SAS-updated-chr1 --recode vcf --chr 1 --out SAS_chr1
Executing: plink --bfile coriell_SAS-updated-chr2 --recode vcf --chr 2 --out SAS_chr2
Executing: plink --bfile coriell_SAS-updated-chr3 --recode vcf --chr 3 --out SAS_chr3
Executing: plink --bfile coriell_SAS-updated-chr4 --recode vcf --chr 4 --out SAS_chr4
Executing: plink --bfile coriell_SAS-updated-chr5 --recode vcf --chr 5 --out SAS_chr5
Executing: plink --bfile coriell_SAS-updated-chr6 --recode vcf --chr 6 --out SAS_chr6
Executing: plink --bfile coriell_SAS-updated-chr7 --recode vcf --chr 7 --out SAS_chr7
Executing: plink --bfile coriell_SAS-updated-chr8 --recode vcf --chr 8 --out SAS_chr8
Executing: plink --bfile coriell_SAS-updated-chr9 --recode vcf --chr 9 --out SAS_chr9
Executing: plink --bfile coriell_SAS-updated-chr10 --recode vcf --chr 10 --out SAS_chr10
Executing: plink --bfile coriell_SAS-updated-chr11 --recode vcf --chr 11 --out SAS_chr11
Executing: plink --bfile coriell_SAS-updated-chr

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AAC.log.
Options in effect:
  --bfile coriell_AAC
  --freq
  --out AAC

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1958745 variants loaded from .bim file.
94 people (46 males, 48 females) loaded from .fam.
94 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 94 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998561.
--freq: Allele frequencies (founders only) written to AAC.frq .



Executing: perl HRC-1000G-check-bim.pl -b coriell_AAC.bim -f AAC.frq -r PASS.Variantsbravo-dbsnp-all.tab -h




         Script to check plink .bim files against HRC/1000G for
        strand, id names, positions, alleles, ref/alt assignment
                         William Rayner 2015
                        wrayner@well.ox.ac.uk

                             Version 4.2.5


Options Set:
Reference Panel:             HRC
Bim filename:                coriell_AAC.bim
Reference filename:          PASS.Variantsbravo-dbsnp-all.tab
Allele frequencies filename: AAC.frq
Allele frequency threshold:  0.2


Reading PASS.Variantsbravo-dbsnp-all.tab
 100000 200000 300000 400000 500000 600000 700000 800000 900000 1000000 1100000 1200000 1300000 1400000 1500000 1600000 1700000 1800000 1900000 2000000 2100000 2200000 2300000 2400000 2500000 2600000 2700000 2800000 2900000 3000000 3100000 3200000 3300000 3400000 3500000 3600000 3700000 3800000 3900000 4000000 4100000 4200000 4300000 4400000 4500000 4600000 4700000 4800000 4900000 5000000 5100000 5200000 5300000 5400000 5500000 5600000 5700000 5800000 5900000 60

Executing: sh Run-plink.sh


PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to TEMP1.log.
Options in effect:
  --bfile coriell_AAC
  --exclude Exclude-coriell_AAC-HRC.txt
  --make-bed
  --out TEMP1

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1958745 variants loaded from .bim file.
94 people (46 males, 48 females) loaded from .fam.
94 phenotype values loaded from .fam.
--exclude: 1235788 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 94 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.99865.
1235788 variants and 94 people pass filters and QC.
Among remaining phenotypes, 39 are cases and 55 

Executing: plink --bfile coriell_AAC-updated-chr1 --recode vcf --chr 1 --out AAC_chr1
Executing: plink --bfile coriell_AAC-updated-chr2 --recode vcf --chr 2 --out AAC_chr2
Executing: plink --bfile coriell_AAC-updated-chr3 --recode vcf --chr 3 --out AAC_chr3
Executing: plink --bfile coriell_AAC-updated-chr4 --recode vcf --chr 4 --out AAC_chr4
Executing: plink --bfile coriell_AAC-updated-chr5 --recode vcf --chr 5 --out AAC_chr5
Executing: plink --bfile coriell_AAC-updated-chr6 --recode vcf --chr 6 --out AAC_chr6
Executing: plink --bfile coriell_AAC-updated-chr7 --recode vcf --chr 7 --out AAC_chr7
Executing: plink --bfile coriell_AAC-updated-chr8 --recode vcf --chr 8 --out AAC_chr8
Executing: plink --bfile coriell_AAC-updated-chr9 --recode vcf --chr 9 --out AAC_chr9
Executing: plink --bfile coriell_AAC-updated-chr10 --recode vcf --chr 10 --out AAC_chr10
Executing: plink --bfile coriell_AAC-updated-chr11 --recode vcf --chr 11 --out AAC_chr11
Executing: plink --bfile coriell_AAC-updated-chr

In [None]:
# Submit the imputation swarm file from {swarm_scripts_dir} with the `swarm` CLI.
# NOTE(review): -g / -t presumably request memory (GB) and threads per command,
# and --gres=lscratch:20 local scratch space — confirm against the cluster docs.
# Scheduler logs go to the relative `swarm` directory (--logdir swarm).
!swarm -f {swarm_scripts_dir}/run_imputation_pipeline.swarm -g 20 -t 16 --time=80:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

In [10]:
# Re-run imputation prep for the cleaned AAC genotypes and write a one-line
# swarm file that launches the imputation pipeline on the prepared data.
ref_panel = '/data/vitaled2/GenoTools/ref/PASS.Variantsbravo-dbsnp-all.tab'
check_bim_pl = '/data/vitaled2/GenoTools/ref/HRC-1000G-check-bim.pl'
imputed_out_dir = f'{out_genotypes}/coriell/imputed'
rerun_geno = '/data/CARD/PD/GP2/genotypes/coriell/clean/coriell_AAC'
rerun_label = 'AAC'

# Working directory/prefix for the prepared files and final imputed output dir.
label_temp_outdir = f'{basedir}/coriell/bed/{rerun_label}'
label_temp = f'{label_temp_outdir}/{rerun_label}'
label_outdir = f'{imputed_out_dir}/{rerun_label}'
os.makedirs(label_outdir, exist_ok=True)
os.makedirs(label_temp_outdir, exist_ok=True)

# Run the prep step BEFORE touching the swarm file, so a failure here does not
# leave behind a truncated/empty swarm script.
impute_data = impute_data_prep(rerun_geno, label_temp, ref_panel, check_bim_pl)

run_pipeline = f'python3 ../run_imputation_pipeline.py --geno {label_temp} --token {config.api_key} --out {label_outdir}/'

# The `with` block closes the file on exit; no explicit close() is needed.
with open(f'{swarm_scripts_dir}/rerun_imputation_pipeline.swarm', 'w') as f:
    f.write(f'{run_pipeline}\n')

# Show the command for review with the API token masked. Do not `cat` the swarm
# file here: that would store the token in the notebook's saved output.
api_token = str(config.api_key)
print(run_pipeline.replace(api_token, '<REDACTED>') if api_token else run_pipeline)

Executing: plink --bfile coriell_AAC --freq --out AAC


PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to AAC.log.
Options in effect:
  --bfile coriell_AAC
  --freq
  --out AAC

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1958745 variants loaded from .bim file.
94 people (46 males, 48 females) loaded from .fam.
94 phenotype values loaded from .fam.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 94 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.998561.
--freq: Allele frequencies (founders only) written to AAC.frq .



Executing: perl HRC-1000G-check-bim.pl -b coriell_AAC.bim -f AAC.frq -r PASS.Variantsbravo-dbsnp-all.tab -h




         Script to check plink .bim files against HRC/1000G for
        strand, id names, positions, alleles, ref/alt assignment
                         William Rayner 2015
                        wrayner@well.ox.ac.uk

                             Version 4.2.5


Options Set:
Reference Panel:             HRC
Bim filename:                coriell_AAC.bim
Reference filename:          PASS.Variantsbravo-dbsnp-all.tab
Allele frequencies filename: AAC.frq
Allele frequency threshold:  0.2


Reading PASS.Variantsbravo-dbsnp-all.tab
 100000 200000 300000 400000 500000 600000 700000 800000 900000 1000000 1100000 1200000 1300000 1400000 1500000 1600000 1700000 1800000 1900000 2000000 2100000 2200000 2300000 2400000 2500000 2600000 2700000 2800000 2900000 3000000 3100000 3200000 3300000 3400000 3500000 3600000 3700000 3800000 3900000 4000000 4100000 4200000 4300000 4400000 4500000 4600000 4700000 4800000 4900000 5000000 5100000 5200000 5300000 5400000 5500000 5600000 5700000 5800000 5900000 60

Executing: sh Run-plink.sh


PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to TEMP1.log.
Options in effect:
  --bfile coriell_AAC
  --exclude Exclude-coriell_AAC-HRC.txt
  --make-bed
  --out TEMP1

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1958745 variants loaded from .bim file.
94 people (46 males, 48 females) loaded from .fam.
94 phenotype values loaded from .fam.
--exclude: 1235788 variants remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 94 founders and 0 nonfounders present.
Calculating allele frequencies... 10111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989 done.
Total genotyping rate is 0.99865.
1235788 variants and 94 people pass filters and QC.
Among remaining phenotypes, 39 are cases and 55 

Executing: plink --bfile coriell_AAC-updated-chr1 --recode vcf --chr 1 --out AAC_chr1
Executing: plink --bfile coriell_AAC-updated-chr2 --recode vcf --chr 2 --out AAC_chr2
Executing: plink --bfile coriell_AAC-updated-chr3 --recode vcf --chr 3 --out AAC_chr3
Executing: plink --bfile coriell_AAC-updated-chr4 --recode vcf --chr 4 --out AAC_chr4
Executing: plink --bfile coriell_AAC-updated-chr5 --recode vcf --chr 5 --out AAC_chr5
Executing: plink --bfile coriell_AAC-updated-chr6 --recode vcf --chr 6 --out AAC_chr6
Executing: plink --bfile coriell_AAC-updated-chr7 --recode vcf --chr 7 --out AAC_chr7
Executing: plink --bfile coriell_AAC-updated-chr8 --recode vcf --chr 8 --out AAC_chr8
Executing: plink --bfile coriell_AAC-updated-chr9 --recode vcf --chr 9 --out AAC_chr9
Executing: plink --bfile coriell_AAC-updated-chr10 --recode vcf --chr 10 --out AAC_chr10
Executing: plink --bfile coriell_AAC-updated-chr11 --recode vcf --chr 11 --out AAC_chr11
Executing: plink --bfile coriell_AAC-updated-chr

python3 ../run_imputation_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/coriell/bed/AAC/AAC --token <REDACTED> --out /data/CARD/PD/GP2/genotypes/coriell/imputed/AAC/


In [11]:
# Submit rerun_imputation_pipeline.swarm (written by the preceding cell) with the `swarm` CLI.
# NOTE(review): -g / -t presumably request memory (GB) and threads per command,
# and --gres=lscratch:20 local scratch space — confirm against the cluster docs.
!swarm -f {swarm_scripts_dir}/rerun_imputation_pipeline.swarm -g 20 -t 16 --time=80:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

18178627


['EAS', 'AJ', 'AMR', 'EUR', 'SAS', 'AAC']

In [19]:
# Set up inputs for merging the AAC genotypes with the reference panel.
# NOTE: `ref_panel` is rebound here; from this cell on it is the pruned
# reference-panel PLINK prefix, not the imputation reference table.

from QC.utils import merge_genos

ref_dir_path = '/data/LNG/vitaled2/1kgenomes'
out_dir = '/data/CARD/PD/GP2/genotypes/coriell/clean'
aac_geno = '/data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant'

# PLINK prefixes derived from the locations above.
ref_panel = ref_dir_path + '/1kg_ashkj_ref_panel_gp2_pruned'
merged_ref_geno = aac_geno + '_merged_ref'



In [12]:
# Two-pass pruning of the AAC genotypes:
#   pass 1 - MAF / missingness / HWE filters, autosomes only, palindromic SNPs removed
#   pass 2 - drop variants inside the exclusion-range file (high-LD regions)
geno_ancestry_prune1 = aac_geno + '_ancestry_prune1'
geno_ancestry_prune2 = aac_geno + '_ancestry_prune2'

# Load the .bim and name its six columns in one step.
bim = pd.read_csv(
    f'{aac_geno}.bim',
    sep='\t',
    header=None,
    names=['chr', 'rsid', 'kb', 'pos', 'a1', 'a2'],
)

# A SNP is palindromic when a2 is the complement of a1 (A/T, T/A, C/G, G/C).
# Alleles outside A/C/G/T map to NaN and therefore never match.
complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
is_palindrome = bim.a1.map(complement) == bim.a2
palindromes = bim.loc[is_palindrome]

palindrome_snplist = f'{aac_geno}_palindromes.snplist'
palindromes['rsid'].to_csv(palindrome_snplist, header=False, index=False, sep='\t')

# Pass 1: variant QC filters plus palindrome exclusion.
plink_cmd1 = ' '.join([
    f'plink --bfile {aac_geno}',
    '--maf 0.05',
    '--geno 0.01',
    '--hwe 0.0001',
    '--autosome',
    '--allow-no-sex',
    f'--exclude {palindrome_snplist}',
    '--make-bed',
    f'--out {geno_ancestry_prune1}',
])

# Pass 2: remove high-LD regions listed in the exclusion-range file.
plink_cmd2 = ' '.join([
    f'plink --bfile {geno_ancestry_prune1}',
    f'--exclude range {ref_dir_path}/hg19_exclusion_regions.txt',
    '--autosome',
    '--allow-no-sex',
    '--make-bed',
    f'--out {geno_ancestry_prune2}',
])

# Order matters: pass 2 reads the output of pass 1.
for prune_cmd in (plink_cmd1, plink_cmd2):
    shell_do(prune_cmd)

Executing: plink --bfile /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant --maf 0.05 --geno 0.01 --hwe 0.0001 --autosome --allow-no-sex --exclude /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_palindromes.snplist --make-bed --out /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune1
Executing: plink --bfile /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune1 --exclude range /data/LNG/vitaled2/1kgenomes/hg19_exclusion_regions.txt --autosome --allow-no-sex --make-bed --out /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune2


In [32]:
# Find the SNP IDs present in both the pruned reference panel and the pruned
# cohort genotypes, and write them out as a one-column SNP list.
bim_columns = ['chr', 'rsid', 'kb', 'pos', 'a1', 'a2']

geno_pruned_bim = pd.read_csv(
    f'{geno_ancestry_prune2}.bim', sep='\t', header=None, names=bim_columns
)
ref_bim = pd.read_csv(
    f'{ref_panel}.bim', sep='\t', header=None, names=bim_columns
)

# Inner join on the variant ID only; the other columns get _x/_y suffixes.
common_snps = pd.merge(ref_bim, geno_pruned_bim, how='inner', on='rsid')

common_snps.rsid.to_csv(
    f'{out_dir}/aac_geno_ref_panel_common.snps',
    sep='\t',
    header=False,
    index=False,
)

In [38]:
# Subset the reference panel to the shared SNP list and write it as a new
# PLINK binary fileset under out_dir.
ref_panel_common_snps = out_dir + '/aac_geno_ref_panel_common.snps'
ref_panel_common_snps_geno_out = out_dir + '/aac_geno_ref_panel_common_snps'

ext_snps_cmd = ' '.join([
    'plink',
    f'--bfile {ref_panel}',
    f'--extract {ref_panel_common_snps}',
    '--make-bed',
    f'--out {ref_panel_common_snps_geno_out}',
])
shell_do(ext_snps_cmd)

Executing: plink --bfile /data/LNG/vitaled2/1kgenomes/1kg_ashkj_ref_panel_gp2_pruned --extract /data/CARD/PD/GP2/genotypes/coriell/clean/aac_geno_ref_panel_common.snps --make-bed --out /data/CARD/PD/GP2/genotypes/coriell/clean/aac_geno_ref_panel_common_snps


In [40]:
# Write a two-column (rsid, a1) table taken from the subsetted reference
# panel's .bim; it is used afterwards as the reference-allele list.
ref_panel_common_snps_ref_alleles = ref_panel_common_snps_geno_out + '.ref_allele'

ref_panel_common_snps_bim = pd.read_csv(
    f'{ref_panel_common_snps_geno_out}.bim',
    sep='\t',
    header=None,
    names=['chr', 'rsid', 'kb', 'pos', 'a1', 'a2'],
)

ref_panel_common_snps_bim.loc[:, ['rsid', 'a1']].to_csv(
    ref_panel_common_snps_ref_alleles,
    sep='\t',
    header=False,
    index=False,
)

In [42]:
# Subset the pruned cohort genotypes to the shared SNP list, passing the
# (rsid, a1) table from the reference panel as the reference-allele file.
geno_common_snps = geno_ancestry_prune2 + '_common_snps'

ext_snps_cmd = ' '.join([
    'plink',
    f'--bfile {geno_ancestry_prune2}',
    f'--extract {ref_panel_common_snps}',
    f'--reference-allele {ref_panel_common_snps_ref_alleles}',
    '--make-bed',
    f'--out {geno_common_snps}',
])
shell_do(ext_snps_cmd)

Executing: plink --bfile /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune2 --extract /data/CARD/PD/GP2/genotypes/coriell/clean/aac_geno_ref_panel_common.snps --reference-allele /data/CARD/PD/GP2/genotypes/coriell/clean/aac_geno_ref_panel_common_snps.ref_allele --make-bed --out /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune2_common_snps


In [45]:
# Merge the cohort subset with the reference-panel subset.
# `merged_ref_geno` is rebound here to a prefix based on the pruned genotypes;
# this is the value all later cells use.
merged_ref_geno = geno_ancestry_prune2 + '_merged_ref'

merge_genos(
    geno_common_snps,
    ref_panel_common_snps_geno_out,
    merged_ref_geno,
)

Executing: plink --bfile /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune2_common_snps --allow-no-sex --bmerge /data/CARD/PD/GP2/genotypes/coriell/clean/aac_geno_ref_panel_common_snps --out /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune2_merged_ref --make-bed
Executing: plink --bfile /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune2_common_snps --allow-no-sex --flip /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune2_merged_ref-merge.missnp --make-bed --out /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune2_common_snps_flip
Executing: plink --bfile /data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant_ancestry_prune2_

In [55]:
# Attach an ancestry label to every sample in the reference panel's .fam.
ancestry_path = '/data/LNG/vitaled2/1kgenomes/ref_panel_ancestry.txt'
ancestry = pd.read_csv(
    ancestry_path,
    sep='\t',
    header=None,
    names=['FID', 'IID', 'label'],
)

# The .fam is read without column names, so its first two (integer-named)
# columns are matched against FID/IID in the ancestry table.
ref_fam = pd.read_csv(f'{ref_panel}.fam', sep=' ', header=None)

# Left join: every reference .fam row is kept, labelled or not.
ref_labeled = pd.merge(
    ref_fam,
    ancestry,
    how='left',
    left_on=[0, 1],
    right_on=['FID', 'IID'],
)

In [76]:
# Build the label table for the merged dataset: cohort samples are tagged
# 'new', reference samples keep their ancestry label.
# Raw string for the regex separator: '\s' in a normal string literal is an
# invalid escape sequence.
geno_common_snps_df = pd.read_csv(
    f'{geno_common_snps}.fam',
    header=None,
    sep=r'\s+',
    usecols=[0, 1],
    names=['FID', 'IID'],
)
geno_common_snps_df['label'] = 'new'

# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
# Row order is unchanged: cohort samples first, then the reference panel.
fs_labels = pd.concat(
    [geno_common_snps_df, ref_labeled.loc[:, ['FID', 'IID', 'label']]]
)
fs_labels.to_csv(f'{merged_ref_geno}.labels', sep='\t', index=False)

In [77]:

# Paths for the fastStructure run on the merged cohort + reference dataset.
ancestry_labels = f'{merged_ref_geno}.labels'
structure_out = f'{merged_ref_geno}_structure'
structure = '/data/vitaled2/ref_panel/fastStructure/structure.py'

# Only a single K (k=8) is written to the swarm file; no k=1..8 sweep or
# chooseK step is generated by this cell.
k = 8
fs_swarm_script = f'{out_dir}/faststructure_choosek.swarm'
fs_cmd = f'bash /data/vitaled2/GWAS/gwas/faststructure_setup_and_run.sh -i {merged_ref_geno} -o {structure_out} -f {structure} -k {k}'

# The `with` block closes the file on exit; no explicit close() is needed.
with open(fs_swarm_script, 'w') as f:
    f.write(f'{fs_cmd}\n')

In [79]:
# Submit the fastStructure swarm file; scheduler logs go under {ref_dir_path}/swarm.
fs_submit_cmd = ' '.join([
    'swarm',
    f'-f {fs_swarm_script}',
    '-g 16',
    '--time=10:00:00',
    '-t 20',
    f'--logdir {ref_dir_path}/swarm',
    '--partition=norm',
])
shell_do(fs_submit_cmd)

Executing: swarm -f /data/CARD/PD/GP2/genotypes/coriell/clean/faststructure_choosek.swarm -g 16 --time=10:00:00 -t 20 --logdir /data/LNG/vitaled2/1kgenomes/swarm --partition=norm


In [81]:
# Load the merged dataset's sample list, its label table, and the fastStructure
# meanQ matrix (one row per sample, one column per inferred population).
fam = pd.read_csv(f'{merged_ref_geno}.fam', sep=' ', header=None)
pop = pd.read_csv(f'{merged_ref_geno}.labels', sep='\t')

# Use the same `k` that was written into the swarm command so the filename
# cannot drift from the submitted run. Raw string for the regex separator:
# '\s' in a normal string literal is an invalid escape sequence.
q_df = pd.read_csv(f'{structure_out}.{k}.meanQ', header=None, sep=r'\s+')
q_df.columns = [f'pop{i}' for i in range(len(q_df.columns))]

In [87]:
# Attach sample IDs to the meanQ rows by position.
# NOTE(review): this assumes meanQ rows are in the same order as the merged .fam — confirm.
q_df['FID'] = fam[0]
q_df['IID'] = fam[1]

# Join the ancestry labels on the shared (FID, IID) key.
q_pop_merged = pd.merge(q_df, pop, on=['FID', 'IID'])

In [91]:
# Save the labelled fastStructure table as tab-separated text
# (header row kept, index dropped).
labeled_q_path = out_dir + '/aac_samples_labeled_faststructure.txt'
q_pop_merged.to_csv(labeled_q_path, index=False, sep='\t')

In [93]:
# Display the full path of the labelled fastStructure table.
out_dir + '/aac_samples_labeled_faststructure.txt'

'/data/CARD/PD/GP2/genotypes/coriell/clean/aac_samples_labeled_faststructure.txt'

0.8523977872340425