In [1]:
import pandas as pd
import os
import shutil
import numpy as np
import glob
import subprocess

from QC.utils import shell_do
import QC.config as config
from QC.imputation import impute_data_prep

In [2]:
basedir = '/data/CARD/PD/GP2/raw_genotypes'
out_genotypes = '/data/CARD/PD/GP2/genotypes'
cohort_path = f'{basedir}/coriell'
sample_info_path = f'{cohort_path}/sample_info'
ilmn_files_path = '/data/CARD/PD/GP2/ilmn_files'
bpm = f'{ilmn_files_path}/NeuroBooster_20042459_A1.bpm'
egt = f'{ilmn_files_path}/NBSCluster_file_n1393_011921.egt'
# key_file = f'{sample_info_path}/KEY_FILE_GP2_Coriell_P1_25_042621_B1.txt'
key_file = f'{sample_info_path}/coriell_total_key.txt'
pheno_file = f'{sample_info_path}/CORIELL_20210201_clinical.csv'

ped_dir = f'{cohort_path}/ped'
bed_dir = f'{cohort_path}/bed'
swarm_scripts_dir = f'{cohort_path}/swarm_scripts'
idat_dir = f'{cohort_path}/idats'
raw_idat_file_path = f'{cohort_path}/idats/GP2_coriell_idats'

# create ped and plink directories for raw geno outputs if they don't exist
os.makedirs(ped_dir, exist_ok=True)
os.makedirs(bed_dir, exist_ok=True)
os.makedirs(f'{bed_dir}/indiv_samples', exist_ok=True)
os.makedirs(swarm_scripts_dir, exist_ok=True)
os.makedirs(idat_dir, exist_ok=True)

iaap = f'{ilmn_files_path}/iaap-cli/iaap-cli'

In [29]:
# fix sidransky ids
# om = pd.read_csv(f'{sample_info_path}/GP2_Sidransky_GD_cohort_original_manifest.csv')
# om
# ids = pd.read_csv(f'{sample_info_path}/GP2_BATCH3_Coriell_P26_43_Sidransky_772021.csv')
# for x in ids.Sample_ID.str.replace('\W', '_', regex=True):
#     print(x)
# for x in pheno.Original_clinicalID:
#     print(x)

In [47]:
key

Unnamed: 0,Sample_ID,SentrixBarcode_A,SentrixPosition_A,filename
0,ND06295,204701860088,R01C01,204701860088_R01C01
1,ND06869,204701860088,R02C01,204701860088_R02C01
2,ND06634,204701860088,R03C01,204701860088_R03C01
3,ND06354,204701860088,R04C01,204701860088_R04C01
4,ND03497,204701860088,R05C01,204701860088_R05C01
...,...,...,...,...
4311,SidranskyGD_103,205275450145,R04C01,205275450145_R04C01
4312,SidranskyGD_105,205275450145,R05C01,205275450145_R05C01
4313,SidranskyGD_1002,205275450145,R06C01,205275450145_R06C01
4314,SidranskyGD_1005,205275450145,R07C01,205275450145_R07C01


In [21]:
pheno = pd.read_csv(pheno_file)
key = pd.read_csv(key_file, sep='\t')


pheno['Original_clinicalID'] = pheno['Original_clinicalID'].astype(str)
key['Sample_ID'] = key['Sample_ID'].astype(str)
key['filename'] = key['SentrixBarcode_A'].astype(str) + '_' + key['SentrixPosition_A']
pheno_out = key.merge(pheno, how='inner', left_on='Sample_ID', right_on='Original_clinicalID')
pheno_out['IID'] = pheno_out.SentrixBarcode_A.astype(str) + '_' + pheno_out.SentrixPosition_A.astype(str)
pheno_out['FID'] = 0
pheno_out['FID_new'] = 0
pheno_out['pheno'] = 0
pheno_out.loc[pheno_out.Phenotype == 'PD', 'pheno'] = 2
pheno_out.loc[pheno_out.Phenotype == 'Control', 'pheno'] = 1
pheno_out.loc[pheno_out.Phenotype == np.nan, 'pheno'] = 0

pheno_out[['FID','IID', 'FID_new', 'Sample_ID']].to_csv(f'{sample_info_path}/update_ids.txt', sep='\t', header=False, index=False)
pheno_out[['FID_new', 'Sample_ID', 'pheno']].to_csv(f'{sample_info_path}/update_pheno.txt', sep='\t', header=False, index=False)

In [37]:
pheno_out['pheno'].value_counts()

2    2777
1    1251
Name: pheno, dtype: int64

In [37]:
# create a folder in idats for each plate in new idat_dir
for code in key.SentrixBarcode_A.unique():
    os.makedirs(f'{idat_dir}/{code}', exist_ok=True)

# copy idat intensity files to respective directories under idat_dir
missing_idats = []

for i, filename in enumerate(key.filename):
    sentrix_code = key.SentrixBarcode_A.iloc[i]
    grn = f'{raw_idat_file_path}/{filename}_Grn.idat'
    red = f'{raw_idat_file_path}/{filename}_Red.idat'

    if os.path.isfile(grn):
        shutil.copyfile(src=grn, dst=f'{idat_dir}/{sentrix_code}/{filename}_Grn.idat')
    else:
        missing_idats.append(grn)

    if os.path.isfile(red):
        shutil.copyfile(src=red, dst=f'{idat_dir}/{sentrix_code}/{filename}_Red.idat')
    else:
        missing_idats.append(red)

len(missing_idats)

2

In [39]:
with open(f'{swarm_scripts_dir}/idat_to_ped.swarm', 'w') as f:
    
    for code in key.SentrixBarcode_A.unique():
        
        idat_to_ped_cmd = f'\
{iaap} gencall \
{bpm} \
{egt} \
{ped_dir}/ \
-f {idat_dir}/{code} \
-p \
-t 8'
        
        f.write(f'{idat_to_ped_cmd}\n')
f.close()


In [40]:
!swarm -f {swarm_scripts_dir}/idat_to_ped.swarm -g 32 -t 16 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

19399969


In [43]:
# copy map file to match name of each ped
map_file = f'{ped_dir}/NeuroBooster_20042459_A1.map'
for filename in key.filename:
    ped = f'{ped_dir}/{filename}.ped'
    out_map = f'{ped_dir}/{filename}.map'
    if os.path.isfile(ped):
        shutil.copyfile(src=map_file, dst=out_map)
    else:
        print(f'{ped} does not exist!')
        print(f'{out_map} creation cancelled')

/data/CARD/PD/GP2/raw_genotypes/coriell/ped/205275450156_R02C01.ped does not exist!
/data/CARD/PD/GP2/raw_genotypes/coriell/ped/205275450156_R02C01.map creation cancelled


In [44]:

with open(f'{swarm_scripts_dir}/make_bed.swarm', 'w') as f:
    for filename in key.filename:
        ped = f'{ped_dir}/{filename}'
        make_bed_cmd = f'\
plink \
--file {ped} \
--make-bed \
--out {bed_dir}/indiv_samples/{filename}'

        f.write(f'{make_bed_cmd}\n')
f.close()


In [45]:
!swarm -f {swarm_scripts_dir}/make_bed.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

19401925


In [53]:
# write plink merge command
with open(f"{bed_dir}/merge_bed.list", 'w') as f:
    for filename in key.filename:
        bed = f'{bed_dir}/indiv_samples/{filename}'
        if os.path.isfile(f'{bed}.bed'):
            f.write(f'{bed}\n')
        else:
            print(f'{bed} does not exist!!!')
f.close()

with open(f"{swarm_scripts_dir}/merge.swarm", 'w') as f:

    plink_merge_cmd = f'\
plink \
--merge-list {bed_dir}/merge_bed.list \
--update-ids {sample_info_path}/update_ids.txt \
--make-bed \
--out {bed_dir}/indiv_samples/coriell_merge'
    f.write(f"{plink_merge_cmd}")
f.close()

/data/CARD/PD/GP2/raw_genotypes/coriell/bed/indiv_samples/205275450156_R02C01 does not exist!!!


In [54]:
!swarm -f {swarm_scripts_dir}/merge.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

19403938


In [3]:
!cat swarm/swarm_19403938_0.o

---- COMMAND EXECUTED: ---------------------------------------------------------
(   plink --merge-list /data/CARD/PD/GP2/raw_genotypes/coriell/bed/merge_bed.list --update-ids /data/CARD/PD/GP2/raw_genotypes/coriell/sample_info/update_ids.txt --make-bed --out /data/CARD/PD/GP2/raw_genotypes/coriell/bed/indiv_samples/coriell_merge )
--------------------------------------------------------------------------------
PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /data/CARD/PD/GP2/raw_genotypes/coriell/bed/indiv_samples/coriell_merge.log.
Options in effect:
  --make-bed
  --merge-list /data/CARD/PD/GP2/raw_genotypes/coriell/bed/merge_bed.list
  --out /data/CARD/PD/GP2/raw_genotypes/coriell/bed/indiv_samples/coriell_merge
  --update-ids /data/CARD/PD/GP2/raw_genotypes/coriell/sample_info/update_ids.txt

257652 MB RAM detected; reserving 128826 MB for main workspace.
Perfor

In [4]:
!plink --bfile {bed_dir}/indiv_samples/coriell_merge --pheno {sample_info_path}/update_pheno.txt --make-bed --out {bed_dir}/coriell

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /data/CARD/PD/GP2/raw_genotypes/coriell/bed/coriell.log.
Options in effect:
  --bfile /data/CARD/PD/GP2/raw_genotypes/coriell/bed/indiv_samples/coriell_merge
  --make-bed
  --out /data/CARD/PD/GP2/raw_genotypes/coriell/bed/coriell
  --pheno /data/CARD/PD/GP2/raw_genotypes/coriell/sample_info/update_pheno.txt

386449 MB RAM detected; reserving 193224 MB for main workspace.
2004347 variants loaded from .bim file.
4315 people (2392 males, 1749 females, 174 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
/data/CARD/PD/GP2/raw_genotypes/coriell/bed/coriell.nosex .
4027 phenotype values present after --pheno.
phenotypes to be ignored, use the --allow-no-sex flag.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 4315 founders and 0 nonfounders present.
Calculating allele frequencie

In [3]:
# run QC pipeline
geno_path = f'{basedir}/coriell/bed/coriell'
out_dir = f'{out_genotypes}/coriell/clean'
out_path = f'{out_dir}/coriell'
os.makedirs(f'{out_dir}', exist_ok=True)
ref_dir_path = '/data/LNG/vitaled2/1kgenomes'
ref_panel = f'{ref_dir_path}/1kg_ashkj_ref_panel_gp2_pruned'
ref_labels = f'{ref_dir_path}/ref_panel_ancestry.txt'


with open(f'{swarm_scripts_dir}/run_qc_pipeline.swarm','w') as f:
    run_pipeline = f'python3 ../run_qc_pipeline.py --geno {geno_path} --ref {ref_panel} --ref_labels {ref_labels} --out {out_path}'
    f.write(f'{run_pipeline}\n')
f.close()
!cat {swarm_scripts_dir}/run_qc_pipeline.swarm

python3 ../run_qc_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/coriell/bed/coriell --ref /data/LNG/vitaled2/1kgenomes/1kg_ashkj_ref_panel_gp2_pruned --ref_labels /data/LNG/vitaled2/1kgenomes/ref_panel_ancestry.txt --out /data/CARD/PD/GP2/genotypes/coriell/clean/coriell


In [6]:
!swarm -f {swarm_scripts_dir}/run_qc_pipeline.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

19417202


In [35]:
eur_fam =  pd.read_csv(f'{out_dir}/coriell_EUR.fam', sep='\s+', header=None, names=['FID','IID','pat','mat','SEX','PHENO'])
eur_fam

Unnamed: 0,FID,IID,pat,mat,SEX,PHENO
0,0,ND05660,0,0,1,2
1,0,ND05661,0,0,2,2
2,0,ND06151,0,0,1,2
3,0,ND06285,0,0,1,2
4,0,ND06399,0,0,2,2
...,...,...,...,...,...,...
3122,0,ND23831,0,0,1,2
3123,0,ND23833,0,0,2,2
3124,0,ND23844,0,0,1,1
3125,0,ND23836,0,0,1,2


In [44]:
eur_out = eur_fam[eur_fam.IID.isin(pheno_out.Sample_ID)]
eur_out[['FID','IID']].to_csv(f'{out_dir}/coriell_ONLY_EUR.samples', sep='\t',header=False, index=False)

In [45]:
!plink --bfile {out_dir}/coriell_EUR --keep {out_dir}/coriell_ONLY_EUR.samples --make-bed --out {out_dir}/coriell_FINAL_EUR

PLINK v1.90b4.4 64-bit (21 May 2017)           www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /data/CARD/PD/GP2/genotypes/coriell/clean/coriell_FINAL_EUR.log.
Options in effect:
  --bfile /data/CARD/PD/GP2/genotypes/coriell/clean/coriell_EUR
  --keep /data/CARD/PD/GP2/genotypes/coriell/clean/coriell_ONLY_EUR.samples
  --make-bed
  --out /data/CARD/PD/GP2/genotypes/coriell/clean/coriell_FINAL_EUR

1547809 MB RAM detected; reserving 773904 MB for main workspace.
1902953 variants loaded from .bim file.
3127 people (1791 males, 1336 females) loaded from .fam.
3029 phenotype values loaded from .fam.
--keep: 3029 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 3029 founders and 0 nonfounders present.
Calculating allele frequencies... 101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475

In [53]:
!cat {out_dir}/coriell_EUR.fam | wc -l
!ls {out_dir}

3127
coriell_AAC.bed  coriell_AMR.hh		coriell_FINAL_EUR.bim
coriell_AAC.bim  coriell_AMR.log	coriell_FINAL_EUR.fam
coriell_AAC.fam  coriell_EAS.bed	coriell_FINAL_EUR.hh
coriell_AAC.hh	 coriell_EAS.bim	coriell_FINAL_EUR.log
coriell_AAC.log  coriell_EAS.fam	coriell_ONLY_EUR.samples
coriell_AJ.bed	 coriell_EAS.hh		coriell.QC.metrics.h5
coriell_AJ.bim	 coriell_EAS.log	coriell_SAS.bed
coriell_AJ.fam	 coriell_EUR.bed	coriell_SAS.bim
coriell_AJ.hh	 coriell_EUR.bim	coriell_SAS.fam
coriell_AJ.log	 coriell_EUR.fam	coriell_SAS.hh
coriell_AMR.bed  coriell_EUR.hh		coriell_SAS.log
coriell_AMR.bim  coriell_EUR.log
coriell_AMR.fam  coriell_FINAL_EUR.bed


In [11]:
# run imputation

# these files need to be in a different place eventually
ref_panel='/data/vitaled2/GenoTools/ref/PASS.Variantsbravo-dbsnp-all.tab'
check_bim_pl = '/data/vitaled2/GenoTools/ref/HRC-1000G-check-bim.pl'

imputed_out_dir = f'{out_genotypes}/coriell/imputed' 
os.makedirs(imputed_out_dir, exist_ok=True)

# jenky method for pulling cleaned genos with ancestry labels for imputation... should figure out how to do this better later
impute_genos_list = [x.split('.')[0] for x in glob.glob(f'{out_dir}/*.bed')]

impute_labels_list = [x.split('/')[-1].replace('.bed','').split('_')[-1] for x in impute_genos_list]

with open(f'{swarm_scripts_dir}/run_imputation_pipeline.swarm','w') as f:
    for geno, label in zip(impute_genos_list, impute_labels_list):

        label_temp_outdir = f'{basedir}/coriell/bed/{label}'
        label_temp = f'{label_temp_outdir}/{label}'
        label_outdir = f'{imputed_out_dir}/{label}'
        os.makedirs(f'{label_outdir}', exist_ok=True)
        os.makedirs(f'{label_temp_outdir}', exist_ok=True)
#         impute_data = impute_data_prep(geno, label_temp, ref_panel, check_bim_pl)
        run_pipeline = f'proxyon; python3 ../run_imputation_pipeline.py --geno {label_temp} --token {config.api_key} --out {label_outdir}/'
#         run_pipeline = f'python3 ../run_imputation_pipeline.py --geno {geno} --temp {label_temp} --token {config.api_key} --ref_panel {ref_panel} --check_bim_pl {check_bim_pl} --out {label_outdir}/'
        f.write(f'{run_pipeline}\n')
        
f.close()


    
# !cat {swarm_scripts_dir}/run_imputation_pipeline.swarm

In [12]:
!swarm -f {swarm_scripts_dir}/run_imputation_pipeline.swarm -g 20 -t 16 --time=80:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

20842629


In [14]:
# !cat swarm/swarm_20842629_0.e
!cat {swarm_scripts_dir}/run_imputation_pipeline.swarm

proxyon; python3 ../run_imputation_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/coriell/bed/EAS/EAS --token eyJjdHkiOiJ0ZXh0XC9wbGFpbiIsImFsZyI6IkhTMjU2In0.eyJtYWlsIjoidml0YWxlZDJAbmloLmdvdiIsImV4cGlyZSI6MTYyNjQwNjczMzc5OCwibmFtZSI6IkRhbiBWaXRhbGUiLCJhcGkiOnRydWUsInVzZXJuYW1lIjoidml0YWxlZDIifQ.hcHyBgJmcTZDEpFnb8t5gH1lfxSZQZHC4Lu9IhN0E18 --out /data/CARD/PD/GP2/genotypes/coriell/imputed/EAS/
proxyon; python3 ../run_imputation_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/coriell/bed/AJ/AJ --token eyJjdHkiOiJ0ZXh0XC9wbGFpbiIsImFsZyI6IkhTMjU2In0.eyJtYWlsIjoidml0YWxlZDJAbmloLmdvdiIsImV4cGlyZSI6MTYyNjQwNjczMzc5OCwibmFtZSI6IkRhbiBWaXRhbGUiLCJhcGkiOnRydWUsInVzZXJuYW1lIjoidml0YWxlZDIifQ.hcHyBgJmcTZDEpFnb8t5gH1lfxSZQZHC4Lu9IhN0E18 --out /data/CARD/PD/GP2/genotypes/coriell/imputed/AJ/
proxyon; python3 ../run_imputation_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/coriell/bed/AMR/AMR --token eyJjdHkiOiJ0ZXh0XC9wbGFpbiIsImFsZyI6IkhTMjU2In0.eyJtYWlsIjoidml0YWxlZDJAbmloLmdvdiIsImV4c

In [None]:
ref_panel='/data/vitaled2/GenoTools/ref/PASS.Variantsbravo-dbsnp-all.tab'
check_bim_pl = '/data/vitaled2/GenoTools/ref/HRC-1000G-check-bim.pl'
imputed_out_dir = f'{out_genotypes}/coriell/imputed' 
rerun_geno = '/data/CARD/PD/GP2/genotypes/coriell/clean/coriell_AAC'
rerun_label = 'AAC'

with open(f'{swarm_scripts_dir}/rerun_imputation_pipeline.swarm','w') as f:

    label_temp_outdir = f'{basedir}/coriell/bed/{rerun_label}'
    label_temp = f'{label_temp_outdir}/{rerun_label}'
    label_outdir = f'{imputed_out_dir}/{rerun_label}'
    os.makedirs(f'{label_outdir}', exist_ok=True)
    os.makedirs(f'{label_temp_outdir}', exist_ok=True)
    impute_data = impute_data_prep(rerun_geno, label_temp, ref_panel, check_bim_pl)
    run_pipeline = f'python3 ../run_imputation_pipeline.py --geno {label_temp} --token {config.api_key} --out {label_outdir}/'
#         run_pipeline = f'python3 ../run_imputation_pipeline.py --geno {geno} --temp {label_temp} --token {config.api_key} --ref_panel {ref_panel} --check_bim_pl {check_bim_pl} --out {label_outdir}/'
    f.write(f'{run_pipeline}\n')
        
f.close()

!cat {swarm_scripts_dir}/rerun_imputation_pipeline.swarm

In [None]:
!swarm -f {swarm_scripts_dir}/rerun_imputation_pipeline.swarm -g 20 -t 16 --time=80:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

In [35]:
!cat swarm/swarm_19495546_0.e

python3: can't open file '../run_imputation_pipeline.py': [Errno 2] No such file or directory


In [37]:
!cat swarm/swarm_19495546_0.e

python3: can't open file '../run_imputation_pipeline.py': [Errno 2] No such file or directory


In [None]:
# merge AAC with ref panel 

from QC.utils import merge_genos

aac_geno = '/data/CARD/PD/GP2/raw_genotypes/coriell/plink/coriell_callrate_sex_ancestry_AAC_related_het_variant'
out_dir = '/data/CARD/PD/GP2/genotypes/coriell/clean'
ref_dir_path = '/data/LNG/vitaled2/1kgenomes'
ref_panel = f'{ref_dir_path}/1kg_ashkj_ref_panel_gp2_pruned'

merged_ref_geno = f'{aac_geno}_merged_ref'



In [None]:
geno_ancestry_prune1 = f'{aac_geno}_ancestry_prune1'
geno_ancestry_prune2 = f'{aac_geno}_ancestry_prune2'
# geno_ancestry_prune3 = f'{geno_het}_ancestry_prune3'

# prune geno_het for geno, maf, hwe, and palindromes
bim = pd.read_csv(f'{aac_geno}.bim', sep='\t', header=None)

# find and drop palindromes in geno_het bim file
bim.columns = ['chr', 'rsid', 'kb', 'pos', 'a1', 'a2']
palindromes = bim.loc[((bim.a1 == 'A') & (bim.a2 == 'T')) | ((bim.a1 == 'T') & (bim.a2 == 'A')) | ((bim.a1 == 'C') & (bim.a2 == 'G')) | ((bim.a1 == 'G') & (bim.a2 == 'C'))]
palindromes['rsid'].to_csv(f'{aac_geno}_palindromes.snplist', header=False, index=False, sep='\t')

plink_cmd1 = f'plink --bfile {aac_geno}\
 --maf 0.05\
 --geno 0.01\
 --hwe 0.0001\
 --autosome\
 --allow-no-sex\
 --exclude {aac_geno}_palindromes.snplist\
 --make-bed\
 --out {geno_ancestry_prune1}' 

#remove high-LD regions
plink_cmd2 = f'plink --bfile {geno_ancestry_prune1}\
 --exclude range {ref_dir_path}/hg19_exclusion_regions.txt\
 --autosome\
 --allow-no-sex\
 --make-bed\
 --out {geno_ancestry_prune2}'

cmds = [plink_cmd1, plink_cmd2]

for cmd in cmds:
    shell_do(cmd)

In [None]:
# now get common snps between pruned ref panel and pruned geno
geno_pruned_bim = pd.read_csv(f'{geno_ancestry_prune2}.bim', sep='\t', header=None)
geno_pruned_bim.columns = ['chr', 'rsid', 'kb', 'pos', 'a1', 'a2']
ref_bim = pd.read_csv(f'{ref_panel}.bim', sep='\t', header=None)
ref_bim.columns = ['chr', 'rsid', 'kb', 'pos', 'a1', 'a2']

common_snps = ref_bim.merge(geno_pruned_bim, how='inner', on=['rsid'])

common_snps['rsid'].to_csv(f'{out_dir}/aac_geno_ref_panel_common.snps', sep='\t', header=False, index=False)

In [None]:
ref_panel_common_snps_geno_out = f'{out_dir}/aac_geno_ref_panel_common_snps'
ref_panel_common_snps = f'{out_dir}/aac_geno_ref_panel_common.snps'

ext_snps_cmd = f'plink --bfile {ref_panel} --extract {ref_panel_common_snps} --make-bed --out {ref_panel_common_snps_geno_out}'
shell_do(ext_snps_cmd)

In [None]:
# get reference alleles from ref_panel_common_snps
ref_panel_common_snps_ref_alleles = f'{ref_panel_common_snps_geno_out}.ref_allele'
ref_panel_common_snps_bim = pd.read_csv(f'{ref_panel_common_snps_geno_out}.bim', header=None, sep='\t')
ref_panel_common_snps_bim.columns = ['chr', 'rsid', 'kb', 'pos', 'a1', 'a2']
ref_panel_common_snps_bim[['rsid','a1']].to_csv(ref_panel_common_snps_ref_alleles, sep='\t', header=False, index=False)

In [None]:
geno_common_snps = f'{geno_ancestry_prune2}_common_snps'

ext_snps_cmd = f'plink --bfile {geno_ancestry_prune2} --extract {ref_panel_common_snps} --reference-allele {ref_panel_common_snps_ref_alleles} --make-bed --out {geno_common_snps}'
shell_do(ext_snps_cmd)

In [None]:
merged_ref_geno = f'{geno_ancestry_prune2}_merged_ref'

merge_genos(geno_common_snps, ref_panel_common_snps_geno_out, merged_ref_geno)

In [None]:
ancestry = pd.read_csv('/data/LNG/vitaled2/1kgenomes/ref_panel_ancestry.txt', sep='\t', header=None, names=['FID','IID','label'])
ref_fam = pd.read_csv(f'{ref_panel}.fam', sep=' ', header=None)
ref_labeled = ref_fam.merge(ancestry, how='left', left_on=[0,1], right_on=['FID','IID'])

In [None]:
geno_common_snps_df = pd.read_csv(f'{geno_common_snps}.fam', header=None, sep='\s+', usecols=[0,1], names=['FID','IID'])
geno_common_snps_df.loc[:,'label'] = 'new'
fs_labels = geno_common_snps_df.append(ref_labeled.loc[:,['FID','IID','label']])
fs_labels.to_csv(f'{merged_ref_geno}.labels', sep='\t', index=False)

In [None]:

ancestry_labels = f'{merged_ref_geno}.labels'
structure_out = f'{merged_ref_geno}_structure'
# fam = pd.read_csv(f'{ref_panel_prune_final}.fam', sep=' ', header=None)
structure = f'/data/vitaled2/ref_panel/fastStructure/structure.py'


# run for k=1-8 and use chooseK.py to select best
fs_swarm_script = f'{out_dir}/faststructure_choosek.swarm'
with open(fs_swarm_script, 'w') as f:
    k=8
    fs_cmd = f'bash /data/vitaled2/GWAS/gwas/faststructure_setup_and_run.sh -i {merged_ref_geno} -o {structure_out} -f {structure} -k {k}'
    f.write(f'{fs_cmd}\n')
f.close()

In [None]:
shell_do(f'swarm -f {fs_swarm_script} -g 16 --time=10:00:00 -t 20 --logdir {ref_dir_path}/swarm --partition=norm')

In [None]:
fam = pd.read_csv(f'{merged_ref_geno}.fam', sep=' ', header=None)
pop = pd.read_csv(f'{merged_ref_geno}.labels', sep='\t')
q_df = pd.read_csv(f'{structure_out}.8.meanQ', header=None, sep='\s+')
q_df.columns = [f'pop{i}' for i in range(len(q_df.columns))]

In [None]:
q_df['FID'], q_df['IID'] = fam[0], fam[1]
q_pop_merged = q_df.merge(pop, left_on=['FID','IID'], right_on=['FID','IID'])

In [None]:
q_pop_merged.to_csv(f'{out_dir}/aac_samples_labeled_faststructure.txt', sep='\t', header=True, index=False)

In [None]:
f'{out_dir}/aac_samples_labeled_faststructure.txt'