In [None]:
import pandas as pd
import os
import shutil

In [None]:
# clone ILMN GTCtoVCF github repo
# !git clone https://github.com/Illumina/GTCtoVCF.git
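# get hg38 reference
# NOTE(review): the config cell sets ref_fasta = 'ref_data/hg38_ref.fa', but the commands below
# produce ref_data/GCF_000001405.26_GRCh38_genomic.fna -- rename or symlink it accordingly (TODO confirm)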
# !wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.26_GRCh38/GCF_000001405.26_GRCh38_genomic.fna.gz -P ref_data
# !gunzip ref_data/GCF_000001405.26_GRCh38_genomic.fna.gz

In [None]:
# --- top-level locations --------------------------------------------------
# Root of the raw genotype deliveries and root of the processed outputs.
basedir = '/data/CARD/PD/GP2/raw_genotypes'
out_genotypes = '/data/CARD/PD/GP2/genotypes'

# --- combined Shulman + NY delivery (input) --------------------------------
shulman_ny_path = basedir + '/shulman_ny'
gtc_file_path = shulman_ny_path + '/GP2_GCT_files'
idat_file_path = shulman_ny_path + '/GP2_Shulman'

# Files that sit alongside the delivered GTC files.
key_file = gtc_file_path + '/Key File_FINAL_Shulman_and_NY_011421.txt'
manifest_txt_path = gtc_file_path + '/FINALSS_after_rerun__Shulman_and_NY_011421.csv'
bpm = gtc_file_path + '/NeuroBooster_20042459_A1.bpm'
cluster_file = gtc_file_path + '/NBSCluster_file_n1393_011921.egt'

# --- software paths (relative to the notebook's working directory) ---------
GTCtoVCF = 'GTCtoVCF/gtc_to_vcf.py'
iaap = 'iaap-cli-linux-x64-1.1.0-sha.80d7e5b3d9c1fdfc2e99b472a90652fd3848bbc7/iaap-cli/iaap-cli'

# --- reference genome -------------------------------------------------------
ref_fasta = 'ref_data/hg38_ref.fa'

# --- per-cohort inputs (split from the combined delivery) and outputs -------
shulman_gtc_path = basedir + '/SHULMAN/gtc_files'
shulman_idat_path = basedir + '/SHULMAN/idats'
shulman_out = out_genotypes + '/SHULMAN'

ny_gtc_path = basedir + '/NY/gtc_files'
ny_idat_path = basedir + '/NY/idats'
ny_out = out_genotypes + '/NY'

In [None]:
# Load the sample sheet. header=10 tells pandas to take the column names from
# row index 10 (0-based); rows before it are not part of the data.
# NOTE(review): presumably those leading rows are a sample-sheet preamble -- confirm against the file.
manifest = pd.read_csv(
    manifest_txt_path,
    header=10,
)

In [None]:
# Create the directory trees that hold the split cohorts (raw inputs) and
# their processed outputs.
#
# os.makedirs(..., exist_ok=True) is used instead of `!mkdir` so that:
#   * re-running the cell is a no-op rather than a stream of "File exists" errors,
#   * missing parent directories are created automatically,
#   * no path is interpolated into a shell command line.
for cohort in ('SHULMAN', 'NY'):
    # Raw inputs, split out of the combined delivery.
    for subdir in ('gtc_files', 'idats'):
        os.makedirs(f'{basedir}/{cohort}/{subdir}', exist_ok=True)
    # Processed outputs for this cohort.
    for subdir in ('vcfs', 'iaap_called_gtcs', 'ped', 'plink'):
        os.makedirs(f'{out_genotypes}/{cohort}/{subdir}', exist_ok=True)

In [None]:
# Build each sample's file stem as "<SentrixBarcode_A>_<SentrixPosition_A>",
# then split the manifest into one frame per cohort using the Study column.
barcode = manifest['SentrixBarcode_A'].astype(str)
position = manifest['SentrixPosition_A']
manifest['filename'] = barcode + '_' + position

is_shulman = manifest['Study'] == 'Shulman'
is_ny = manifest['Study'] == 'NY'
shulman = manifest.loc[is_shulman]
ny = manifest.loc[is_ny]

In [None]:
# Copy every sample's GTC file from the combined delivery directory into the
# gtc_files directory of the cohort it belongs to.
for cohort_dir, cohort_manifest in (('SHULMAN', shulman), ('NY', ny)):
    for stem in cohort_manifest['filename']:
        gtc_name = f'{stem}.gtc'
        shutil.copyfile(
            f'{gtc_file_path}/{gtc_name}',
            f'{basedir}/{cohort_dir}/gtc_files/{gtc_name}',
        )

In [None]:
# Write a swarm file with one GTCtoVCF command per Shulman sample so the
# conversions can be run in parallel.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('shulman_gtc_to_vcf.swarm', 'w') as f:
    for filename in shulman.filename:
        # Each sample becomes a single space-separated command line.
        shulman_gtc_to_vcf_cmd = ' '.join([
            f'python3 {GTCtoVCF}',
            f'--gtc-paths {shulman_gtc_path}/{filename}.gtc',
            f'--manifest-file {bpm}',
            f'--genome-fasta-file {ref_fasta}',
            f'--output-vcf-path {shulman_out}/vcfs/{filename}.vcf',
            '--skip-indels',
        ])
        f.write(f'{shulman_gtc_to_vcf_cmd}\n')

In [None]:
# !swarm -f shulman_gtc_to_vcf.swarm -g 16 -t 16 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

In [None]:
# Write a swarm file with one GTCtoVCF command per NY sample so the
# conversions can be run in parallel.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('ny_gtc_to_vcf.swarm', 'w') as f:
    for filename in ny.filename:
        # Each sample becomes a single space-separated command line.
        ny_gtc_to_vcf_cmd = ' '.join([
            f'python3 {GTCtoVCF}',
            f'--gtc-paths {ny_gtc_path}/{filename}.gtc',
            f'--manifest-file {bpm}',
            f'--genome-fasta-file {ref_fasta}',
            f'--output-vcf-path {ny_out}/vcfs/{filename}.vcf',
            '--skip-indels',
        ])
        f.write(f'{ny_gtc_to_vcf_cmd}\n')

In [None]:
# !swarm -f ny_gtc_to_vcf.swarm -g 16 -t 16 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

In [None]:
# Use Picard to merge the per-sample Shulman VCFs into a single VCF.
# Each `with` block closes its file on exit, so no explicit close() is needed.

# Step 1: write vcf.list with one per-sample VCF path per line.
with open('vcf.list', 'w') as f:
    for filename in shulman.filename:
        f.write(f'{shulman_out}/vcfs/{filename}.vcf\n')

# Step 2: write the batch script that runs Picard on that list.
# Lines are joined with '\n' and no trailing newline is appended, matching
# the script text this cell has always produced.
merge_script_lines = [
    '#!/bin/bash',
    'set -e',
    'module load picard',
    f'java -Xmx4g -XX:ParallelGCThreads=16 -jar $PICARDJARPATH/picard.jar CombineGenotypingArrayVcfs I=vcf.list O={shulman_out}/vcfs/shulman_merged.vcf',
]
with open('merge_vcfs.sh', 'w') as f:
    f.write('\n'.join(merge_script_lines))

In [None]:
# !swarm -f shulman_vcf_to_bed.swarm -g 32 -t 16 --time=10:00:00 --logdir swarm --gres=lscratch:20 --module plink --partition=norm
# !sbatch --cpus-per-task=32 merge_vcfs.sh

In [None]:
# Write a swarm file with one `iaap-cli gencall` command per unique
# SentrixBarcode_A in the Shulman cohort, so the calls can be run in parallel.
# Each command points -f at {shulman_idat_path}/<barcode> and writes into
# {shulman_out}/iaap_called_gtcs/.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('shulman_idat_to_gtc.swarm', 'w') as f:
    for code in shulman.SentrixBarcode_A.unique():
        # Each barcode becomes a single space-separated command line.
        shulman_idat_to_gtc_cmd = ' '.join([
            f'{iaap} gencall',
            f'{bpm}',
            f'{cluster_file}',
            f'{shulman_out}/iaap_called_gtcs/',
            f'-f {shulman_idat_path}/{code}',
            '-g',
            '-t 16',
        ])
        f.write(f'{shulman_idat_to_gtc_cmd}\n')