In [1]:
import pandas as pd
import os
import shutil
import numpy as np
import glob
import gzip

from QC.utils import shell_do
# from QC.imputation import impute_data_prep
# import QC.config as config

In [2]:
# some setup
# get precompiled binaries
#!cd /data/vitaled2/bin; wget https://software.broadinstitute.org/software/gtc2vcf/gtc2vcf_1.13-20211015.zip; unzip gtc2vcf_1.13-20211015.zip
# set path to gtc2vcf plugin
# !export BCFTOOLS_PLUGINS="/data/vitaled2/bin"


# download ref to create gtc
# !wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.fna.gz -P ../ref
# !gunzip ../ref/GCF_000001405.25_GRCh37.p13_genomic.fna.gz
# !cp ../ref/GCF_000001405.25_GRCh37.p13_genomic.fna ../ref/GCF_000001405.25_GRCh37.p13_genomic.fa

# get hg38 reference
# !wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.26_GRCh38/GCF_000001405.26_GRCh38_genomic.fna.gz -P ../ref
# !gunzip ../ref/GCF_000001405.26_GRCh38_genomic.fna.gz
# !cp ../ref/GCF_000001405.26_GRCh38_genomic.fna ../ref/GCF_000001405.26_GRCh38_genomic.fa

In [3]:
idat_path = '/data/CARD/PD/GP2/raw_genotypes/GP2_idats'
# gtc_path = '/data/CARD/PD/GP2/raw_genotypes/GP2_gtcs'
cnv_path = '/data/CARD/PD/GP2/cnvs'
# vcf_path = '/data/CARD/PD/GP2/raw_genotypes/GP2_vcfs'

ilmn_files_path = '/data/CARD/PD/GP2/ilmn_files'
# A1 is hg19, A2 is hg38. csv needed for indel calls
# bpm = f'{ilmn_files_path}/NeuroBooster_20042459_A1.bpm'
bpm_csv = f'{ilmn_files_path}/NeuroBooster_20042459_A2.csv' 
bpm = f'{ilmn_files_path}/NeuroBooster_20042459_A2.bpm'
egt = f'{ilmn_files_path}/NBSCluster_file_n1393_011921.egt'
iaap = f'{ilmn_files_path}/iaap-cli/iaap-cli'

key_dir = '/data/CARD/PD/GP2/key_files'
clin_dir = '/data/CARD/PD/GP2/clinical'

key_file = f'/data/CARD/PD/GP2/clinical/key_merge/GP2_master_key_full.txt'
clin_file = f'{clin_dir}/GP2_clinical.csv'

swarm_scripts_dir = f'/data/CARD/PD/GP2/swarm_scripts'


GTCtoVCF = '../GTCtoVCF/gtc_to_vcf.py'
ref_fasta = '/data/vitaled2/ref/GRCh38/GTCtoVCF_GRCh38_ref.fa'


key = pd.read_csv(f'{key_file}', sep='\t')
key.loc[:,'filename'] = key.loc[:,'SentrixBarcode_A'].astype(str) + '_' + key.loc[:,'SentrixPosition_A'].astype(str)

# define chromosomes for later use
chroms = [str(x) for x in range(1,23)] + ['X']

grch38_fasta = '/data/vitaled2/ref/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna'

genes = pd.read_csv('/data/CARD/PD/GP2/ref_panel/glist-hg38', sep='\s+', header=None, names=['chr','start','end','symbol'], dtype={'chr':str,'start':int,'end':int})
genes.columns = ['CHR','START','STOP','NAME']
genes[['NAME','CHR','START','STOP']].to_csv('/data/CARD/PD/GP2/ref_panel/glist_hg38_intervals.csv', index=False)
# gene_list originally from plink glist
gene_list = '/data/CARD/PD/GP2/ref_panel/glist_hg38_intervals.csv'



# Call CNVs

In [4]:
# get snp metrics
with open(f'{swarm_scripts_dir}/snp_metrics.swarm', 'w') as f:
    for code in key.SentrixBarcode_A.unique():
        idat_path_ = f'{idat_path}/{code}'
        metrics_out = f'{idat_path}'
        cmd = f'\
python /data/vitaled2/GenoTools/run_snp_metrics_pipeline.py \
--idat_path {idat_path_} \
--bpm {bpm} \
--bpm_csv {bpm_csv} \
--egt {egt} \
--ref_fasta {ref_fasta} \
--out_path {metrics_out} \
--iaap {iaap}'
        f.write(f'{cmd}\n')
f.close()

In [5]:
!swarm -f {swarm_scripts_dir}/snp_metrics.swarm -g 16 -t 16 --time=02:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

36928493


In [85]:
# rerun fails after permissions fix
metric_fails = []
for sample in key.IID.unique():
    code = sample.split('_')[0]
    for chrom in chroms:
        mfile = f'{idat_path}/{code}/snp_metrics_{sample}_chr{chrom}.csv'
        if not os.path.isfile(mfile):
            metric_fails.append(code)
        else:
            pass
metric_fails_list = list(set(metric_fails))
            


In [74]:
with open(f'{swarm_scripts_dir}/snp_metrics2.swarm', 'w') as f:
    for code in metric_fails_list:
        idat_path_ = f'{idat_path}/{code}'
        metrics_out = f'{idat_path}'
        cmd = f'\
python /data/vitaled2/GenoTools/run_snp_metrics_pipeline.py \
--idat_path {idat_path_} \
--bpm {bpm} \
--bpm_csv {bpm_csv} \
--egt {egt} \
--ref_fasta {ref_fasta} \
--out_path {metrics_out} \
--iaap {iaap}'
        f.write(f'{cmd}\n')
f.close()

In [75]:
!swarm -f {swarm_scripts_dir}/snp_metrics2.swarm -g 16 -t 16 --time=02:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

36975418


In [104]:
chroms = [str(i) for i in range(1,23)]
# samples = [s.split('/')[-1].replace('_Grn.idat','') for s in glob.glob(f'{idat_path}/*/*_Grn.idat')]

# snp_metrics_list = []
with open(f'{swarm_scripts_dir}/cnv.swarm', 'w') as f:
    missing_metrics = []
    for sample in key.IID.unique():
        code = sample.split('_')[0]
        for chrom in chroms:
            mfile = f'{idat_path}/{code}/snp_metrics_{sample}_chr{chrom}.csv'
            out_file = f'{idat_path}/{code}/CNV_{sample}_chr{chrom}.csv'
            intervals = f'/data/CARD/PD/GP2/ref_panel/glist_hg38_chr{chrom}.csv'
            cmd = f'\
python /data/vitaled2/GenoTools/run_cnv_pipeline.py \
--metrics {mfile} \
--out_path {out_file} \
--intervals {intervals} \
--min_variants 10 \
--kb_window 100'
            
            f.write(f'{cmd}\n')
f.close()





In [100]:
!swarm -f {swarm_scripts_dir}/cnv.swarm --time=40:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

36981682


In [None]:
# get individual ancestry samples
# labels = ['AAC','AFR','AJ','EAS','EUR','FIN','SAS']
# suffixes = ['bed','bim','fam']

# for label in labels:
#     for suffix in suffixes:
#         !cp /data/CARD/PD/GP2/genotypes/GP2/round2/clean/GP2_round2_{label}.{suffix} {cnv_path}
    
# # now grab AMR and KZ
# for suffix in suffixes:
#     !cp /data/CARD/PD/GP2/genotypes/GP2/round2/temp/GP2_round2_AMR_KZ.{suffix} {cnv_path}
#     !cp /data/CARD/PD/GP2/genotypes/GP2/round2/temp/GP2_round2_AMR.{suffix} {cnv_path}

## Create Dosage Matrices

In [105]:
from CNV.cnv import create_cnv_dosage_matrices

release2_cohorts = ['BCM','UMD','SYNAPS-KZ','MDGAP-QSBB','CORIELL']

release2_key = key.loc[key.study.isin(release2_cohorts)]
release2_key[['FID','GP2sampleID']].to_csv(f'{cnv_path}/release2.samples', sep='\t', header=False, index=False)
release2_key[['GP2sampleID','IID']].to_csv(f'{cnv_path}/release2_sample_id_key.csv')
release2_covars = release2_key.loc[:,['FID', 'GP2sampleID','sex_for_qc', 'age', 'age_of_onset']]

labels = ['AAC','AFR','AJ','EAS','EUR','FIN','SAS','AMR','AMR_KZ']

with open(f'{swarm_scripts_dir}/cnv_dosages.swarm', 'w') as f:
    for label in labels:
        geno = f'{cnv_path}/GP2_round2_{label}'

        cmd1 = f'\
    plink \
    --bfile {geno} \
    --keep {cnv_path}/release2.samples \
    --make-bed \
    --out {geno}_release2'

        cmd2 = f'plink \
    --bfile {geno}_release2 \
    --pca \
    --out {geno}_release2'

        cmds = [cmd1, cmd2]

        for cmd in cmds:
            shell_do(cmd)

        pcs = pd.read_csv(f'{geno}_release2.eigenvec', sep='\s+')
        pc_num = pcs.iloc[:, 2:].shape[1]
        pc_names = ['FID','GP2sampleID'] + [f'PC{i}' for i in range(1, pc_num+1)]
        pcs.columns = pc_names

        cov = pcs.merge(release2_covars, on=['FID','GP2sampleID'], how='left')
        cov.age.fillna(cov.age.mean(), inplace=True)
        cov.age_of_onset.fillna(cov.age_of_onset.mean(), inplace=True)
        cov.sex_for_qc.fillna(cov.sex_for_qc.median(), inplace=True)
        cov.rename(columns={'sex_for_qc':'sex'})
        cov.to_csv(f'{geno}_release2.cov', sep='\t', header=True, index=False)
        
        samples = cov.merge(release2_key[['GP2sampleID','IID']], on='GP2sampleID', how='left')
        samples['IID'].to_csv(f'{geno}_release2_barcode.samples', header=False, index=False)
        
        for chrom in chroms:
            dosage_cmd = f'\
python /data/vitaled2/GenoTools/run_cnv_dosage_pipeline.py \
--metrics_in {idat_path} \
--chrom {chrom} \
--samples {geno}_release2_barcode.samples \
--out_path {geno}'
            f.write(f'{dosage_cmd}\n')
f.close()


In [49]:
!swarm -f {swarm_scripts_dir}/cnv_dosages.swarm --time=2:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

37394518


## update to gp2 ids and grab phenotypes

In [107]:

dosage_files = ['BAF','L2R_DUP','L2R_DEL']
for label in labels:
    for chrom in chroms:
        for dose in dosage_files:
            
            dosagefile = f'{cnv_path}/GP2_round2_{label}_chr{chrom}_{dose}.csv'
            dosagefile_out = f'{cnv_path}/GP2_round2_{label}_chr{chrom}_{dose}_gp2ids.csv'
            pheno_out = f'{cnv_path}/GP2_round2_{label}_chr{chrom}_{dose}.pheno'
            
            dosage = pd.read_csv(dosagefile)
            dosage_merge = dosage.merge(release2_key[['GP2sampleID','IID']], left_on='sampleid', right_on='IID')
            dosage_out = dosage_merge.drop(columns=['sampleid','IID']).set_index('GP2sampleID').reset_index().rename(columns={'GP2sampleID':'sampleid'})
            dosage_out.to_csv(dosagefile_out, sep='\t', header=True, index=False)
            
            
            dosage_pheno = dosage_out.merge(key.loc[:,['GP2sampleID','pheno']], left_on='sampleid', right_on='GP2sampleID', how='left')
            dosage_pheno2 = dosage_pheno.drop(columns=["GP2sampleID"])
            dosage_pheno_out = dosage_pheno2.loc[dosage_pheno2.pheno != -9]
            dosage_pheno_out.loc[:,'pheno'] = np.where(dosage_pheno_out.pheno == 1, 0, 1)
            dosage_pheno_out[['sampleid','pheno']].to_csv(pheno_out, sep='\t', header=True, index=False)
            



## Run generate CNV-WAS summary stats

In [5]:
with open(f'{swarm_scripts_dir}/idat_to_gtc.swarm', 'w') as f:
    
    for code in key.SentrixBarcode_A.unique():
        os.makedirs(f'{gtc_path}/{code}', exist_ok=True)
        idat_to_gtc_cmd = f'\
{iaap} gencall \
{bpm} \
{egt} \
{gtc_path}/{code} \
-f {idat_path}/{code} \
-g \
-t 8'
        
        f.write(f'{idat_to_gtc_cmd}\n')
f.close()

In [39]:
!swarm -f {swarm_scripts_dir}/idat_to_gtc.swarm -g 15 -t 10 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm --job-name idat_to_gtc

32381997


In [6]:
with open(f'{swarm_scripts_dir}/gtc2vcf.swarm','w') as f:
    for code in key.SentrixBarcode_A.unique():
        gtc2vcf_cmd = f'\
export BCFTOOLS_PLUGINS="/data/vitaled2/bin"; bcftools +gtc2vcf \
--no-version -Ob \
--bpm {bpm} \
--csv {bpm_csv} \
--egt {egt} \
--gtcs {gtc_path}/{code} \
--fasta-ref {grch38_fasta} \
--extra {vcf_path}/gp2_snps_{code}_metadata.tsv | \
bcftools norm --no-version -Oz -c w -f {grch38_fasta} > {vcf_path}/gp2_snps_{code}.vcf.gz'
    
        f.write(f'{gtc2vcf_cmd}\n')
f.close()


In [41]:
!swarm -f {swarm_scripts_dir}/gtc2vcf.swarm -g 64 -t 32 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm --job-name gtc_to_vcf

32384390


In [7]:
# sort vcf
!mkdir {vcf_path}/tmp
with open(f'{swarm_scripts_dir}/bcftools_sort.swarm', 'w') as f:
    for code in key.SentrixBarcode_A.unique():
        sort_cmd = f'cd {vcf_path}; bcftools sort gp2_snps_{code}.vcf.gz -T ./tmp -Oz -o gp2_snps_sorted_{code}.vcf.gz'
        f.write(f'{sort_cmd}\n')
f.close()

mkdir: cannot create directory ‘/data/CARD/PD/GP2/raw_genotypes/GP2_vcfs/tmp’: File exists


In [43]:
!swarm -f {swarm_scripts_dir}/bcftools_sort.swarm -g 32 -t 16 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm --job-name vcf_sort

32476908


In [8]:
# split indels and snps in vcf
with open(f'{swarm_scripts_dir}/vcftools_remove_indels.swarm', 'w') as f:
    for code in key.SentrixBarcode_A.unique():
        ext_snps_cmd = f'cd {vcf_path}; vcftools --gzvcf gp2_snps_sorted_{code}.vcf.gz --remove-indels --recode --recode-INFO-all --out gp2_snps_sorted_snps_only_{code}'
        f.write(f'{ext_snps_cmd}\n')
f.close()

In [45]:
!swarm -f {swarm_scripts_dir}/vcftools_remove_indels.swarm -g 16 -t 16 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm --job-name vcf_remove_indels

32481160


In [9]:
# split indels and snps in vcf
with open(f'{swarm_scripts_dir}/vcftools_keep_indels.swarm', 'w') as f:
    for code in key.SentrixBarcode_A.unique():
        keep_indels_cmd = f'cd {vcf_path}; vcftools --gzvcf gp2_snps_sorted_{code}.vcf.gz --keep-only-indels --recode --recode-INFO-all --out gp2_snps_sorted_indels_only_{code}'
        f.write(f'{keep_indels_cmd}\n')
f.close()

In [47]:
!swarm -f {swarm_scripts_dir}/vcftools_keep_indels.swarm -g 16 -t 16 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm --job-name vcf_keep_indels

32481168


In [62]:
# get snp info from each vcf
with open(f'{swarm_scripts_dir}/get_logr_baf.swarm', 'w') as f:
    for code in key.SentrixBarcode_A.unique():
        get_logr_baf = f'python3 process_vcf_snps.py --vcf {vcf_path}/gp2_snps_sorted_snps_only_{code}.recode.vcf --gene_ref /data/CARD/PD/GP2/ref_panel/glist-hg38 --out {vcf_path}/gp2_snp_metrics_{code}.txt'
        f.write(f'{get_logr_baf}\n')
f.close()

In [63]:
!swarm -f {swarm_scripts_dir}/get_logr_baf.swarm -g 64 -t 32 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm --job-name get_snp_metrics

32974379


In [37]:
# get snp info from each vcf
with open(f'{swarm_scripts_dir}/clean_snp_metrics.swarm', 'w') as f:
    for code in key.SentrixBarcode_A.unique():
        clean_snp_metrics = f'python3 clean_snp_metrics.py --infile {vcf_path}/gp2_snp_metrics_{code}.txt --outpath {vcf_path}/gp2_snp_metrics'
        f.write(f'{clean_snp_metrics}\n')
f.close()

In [38]:
!swarm -f {swarm_scripts_dir}/clean_snp_metrics.swarm -g 16 -t 16 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm --job-name clean_snp_metrics

34805915


In [36]:
!cat {swarm_scripts_dir}/logs/clean_snp_metrics_34805480_0.e

  File "clean_snp_metrics.py", line 11
    infile = args.in
                   ^
SyntaxError: invalid syntax
  File "clean_snp_metrics.py", line 11
    infile = args.in
                   ^
SyntaxError: invalid syntax


In [34]:
# # split each file by chr and iid

# infile = f'{vcf_path}/gp2_snp_metrics_204697840024.txt'
# df = pd.read_csv(infile, sep='\t')


# cols = ['CHROM','POS','ID','REF','ALT','sampleid','BAF','LRR']

# for iid in df.sampleid.unique():
#     for chrom in df.CHROM.unique():
        
#         outfile = f'{vcf_path}/gp2_snp_metrics_{iid}_chr{chrom}.txt' 
#         out_df = df.loc[(df.CHROM==chrom) & (df.sampleid==iid)]
#         out_df[['CHROM','ID','POS','BAF','LRR']].to_csv(outfile, sep='\t', header=True, index=False)
        

In [59]:
metrics = pd.read_csv(f'{vcf_path}/gp2_snp_metrics_{code}.txt', sep='\t', nrows=100, dtype={'CHROM':str,'POS':int})
metrics.loc[:,'CHROM'] = metrics.loc[:,'CHROM'].str.replace('chr','')
metrics.loc[:,'gene'] = np.nan

In [60]:
# for i, gene in enumerate(gene_list.symbol):
# #     print(gene_list.loc[i,:])
#     chrom = gene_list.loc[i,'chr']
#     start = gene_list.loc[i,'start'] - 100000
#     end = gene_list.loc[i, 'end'] + 100000
#     metrics.loc[((metrics.CHROM==chrom) & (metrics.POS>=start) & (metrics.POS<=end)),'gene'] = gene
    


## Dev for CNV module in GenoTools

In [25]:
# import os
# import subprocess
# from QC.utils import shell_do

# def idat_snp_metrics(idat_path, bpm, bpm_csv, egt, ref_fasta, out_path, iaap=iaap):
#     '''
#     current structure of idat storage is such that a directory of each SentrixBarcode_A with all idats for that barcode in it
#     for ex.
#     1112223334
#         --> 1112223334_R01C01_Red.idat
#         --> 1112223334_R01C01_Grn.idat
#         --> 1112223334_R01C02_Red.idat
#         --> 1112223334_R01C02_Grn.idat
#         etc.
        
#     '''
#     out_tmp = f'{out_path}/tmp'
#     os.makedirs(out_tmp, exist_ok=True)
#     barcode = idat_path.split('/')[-1].split('_')[0]
#     barcode_out_path = f'{out_path}/{barcode}'
#     os.makedirs(barcode_out_path, exist_ok=True)
    
    
#     idat_to_gtc_cmd = f'\
# {iaap} gencall \
# {bpm} \
# {egt} \
# {barcode_out_path} \
# -f {idat_path} \
# -g \
# -t 8'

#     # export path to plugins temporarily for biowulf. will figure this out later
#     gtc2vcf_cmd = f'\
# export BCFTOOLS_PLUGINS="/data/vitaled2/bin"; \
# bcftools +gtc2vcf \
# --no-version -Ob \
# --bpm {bpm} \
# --csv {bpm_csv} \
# --egt {egt} \
# --gtcs {barcode_out_path} \
# --fasta-ref {ref_fasta} | \
# bcftools norm --no-version -Oz -c w -f {ref_fasta} > {barcode_out_path}/{barcode}.vcf.gz'
    
#     # --extra {vcf_path}/gp2_snps_{code}_metadata.tsv | \

#     # sort vcf
#     sort_cmd = f'\
# bcftools \
# sort {barcode_out_path}/{barcode}.vcf.gz \
# -T {out_tmp}/ \
# -Oz -o {barcode_out_path}/{barcode}_sorted.vcf.gz'
# # cd {barcode_out_path} && \

#     # split indels and snps in vcf
#     ext_snps_cmd = f'\
# vcftools --gzvcf \
# {barcode_out_path}/{barcode}_sorted.vcf.gz \
# --remove-indels \
# --recode \
# --recode-INFO-all \
# --out {barcode_out_path}/{barcode}_sorted_snps'
# # cd {barcode_out_path} && \


#     # split indels and snps in vcf
# #     keep_indels_cmd = f'\
# # cd {barcode_out_path}; \
# # vcftools --gzvcf \
# # {barcode}_sorted.vcf.gz \
# # --keep-only-indels \
# # --recode \
# # --recode-INFO-all \
# # --out {code}_indels'

#     # get snp info from each vcf
#     get_logr_baf = f'\
# python3 process_vcf_snps.py \
# --vcf {barcode_out_path}/{barcode}_sorted_snps.recode.vcf \
# --outfile {barcode_out_path}/snp_metrics_{barcode}.csv'


#     # get snp info from each vcf
#     clean_snp_metrics = f'\
# python3 clean_snp_metrics.py \
# --infile {barcode_out_path}/snp_metrics_{barcode}.csv \
# --outfile {barcode_out_path}/snp_metrics'

#     cmds = [idat_to_gtc_cmd, gtc2vcf_cmd, sort_cmd, ext_snps_cmd, get_logr_baf, clean_snp_metrics]
# #     cmds = [get_logr_baf, clean_snp_metrics]
# #     cmds = [get_logr_baf]
# #     cmds = [clean_snp_metrics]
#     for cmd in cmds:
#         if cmd == gtc2vcf_cmd:
#             subprocess.call(cmd, shell=True)
#         else:
#             shell_do(cmd)
            
 
#     # output snp metrics paths
#     chroms = [str(i) for i in range(1,23)] + ['X','Y']
#     samples = [s.split('/')[-1].replace('_Red.idat','') for s in glob.glob(f'{idat_path}/*_Red.idat')]
    
#     outfiles = []
#     for sample in samples:
#         for chrom in chroms:
#             outfile = f'{barcode_out_path}/snp_metrics_{sample}_chr{chrom}.csv'
#             outfiles.append(outfile)
            
#     return outfiles


# for sample in samples:
#     for chrom in chroms:

#     call_cnvs_cmd = f'\
#     python3 cnv_gene_caller_alpha.py \
#     --infile {barcode_out_path}/snp_metrics_{sample}_chr{chrom}.csv \
#     --outfile {barcode_out_path}/{sample}_chr{chrom} \
#     --intervals {gene_list}'
            
#             print(call_cnvs_cmd)
#             shell_do(call_cnvs_cmd)


In [20]:
# infile = f'{test_path}/snp_metrics_206046180074.csv'
# outfile = f'{test_path}/snp_metrics'

In [70]:
!ls {idat_path}

206046180074_R01C01_Grn.idat  206046180074_R05C01_Grn.idat
206046180074_R01C01_Red.idat  206046180074_R05C01_Red.idat
206046180074_R02C01_Grn.idat  206046180074_R06C01_Grn.idat
206046180074_R02C01_Red.idat  206046180074_R06C01_Red.idat
206046180074_R03C01_Grn.idat  206046180074_R07C01_Grn.idat
206046180074_R03C01_Red.idat  206046180074_R07C01_Red.idat
206046180074_R04C01_Grn.idat  206046180074_R08C01_Grn.idat
206046180074_R04C01_Red.idat  206046180074_R08C01_Red.idat


In [58]:
idat_path = '/data/vitaled2/cnv_test/206046180074'
test_out = '/data/vitaled2/cnv_test'
# snp_metrics_files = idat_snp_metrics(idat_path, bpm, bpm_csv, egt, ref_fasta, test_out)

with open(f'{swarm_scripts_dir}/call_cnvs.swarm', 'w') as f:
    
    for mfile in snp_metrics_files:
        out_prefix = mfile.replace('.csv','').replace('snp_metrics_', '')
#         chrom = out_prefix.split('_')[-1]
#         gene_chrom_list = f'/data/CARD/PD/GP2/ref_panel/glist_hg38_{chrom}.csv'

        call_cnvs_cmd = f'\
python3 cnv_gene_caller_alpha.py \
--infile {mfile} \
--outfile {out_prefix} \
--intervals {gene_list}'

        f.write(f'{call_cnvs_cmd}\n')
f.close()

In [59]:
!swarm -f {swarm_scripts_dir}/call_cnvs.swarm -g 16 -t 16 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

35729936
