In [None]:
import pandas as pd
import os
import shutil

In [None]:
# --- input / output roots ---
basedir = '/data/CARD/PD/GP2/raw_genotypes'
out_genotypes = '/data/CARD/PD/GP2/genotypes'

# --- combined Shulman + NY delivery (everything below hangs off basedir) ---
shulman_ny_path = basedir + '/shulman_ny'
gtc_file_path = shulman_ny_path + '/GP2_GCT_files'
idat_file_path = shulman_ny_path + '/GP2_Shulman'
key_file = gtc_file_path + '/Key File_FINAL_Shulman_and_NY_011421.txt'
manifest_txt_path = gtc_file_path + '/FINALSS_after_rerun__Shulman_and_NY_011421.csv'
bpm = gtc_file_path + '/NeuroBooster_20042459_A1.bpm'
cluster_file = gtc_file_path + '/NBSCluster_file_n1393_011921.egt'

#software paths
GTCtoVCF = 'GTCtoVCF/gtc_to_vcf.py'
iaap = '../executables/iaap-cli-linux-x64-1.1.0-sha.80d7e5b3d9c1fdfc2e99b472a90652fd3848bbc7/iaap-cli/iaap-cli'

ref_fasta = 'ref_data/hg38_ref.fa'

# --- per-cohort locations: raw gtc/idat inputs under basedir, results under out_genotypes ---
shulman_gtc_path = basedir + '/SHULMAN/gtc_files'
shulman_idat_path = basedir + '/SHULMAN/idats'
shulman_out = out_genotypes + '/SHULMAN'
ny_gtc_path = basedir + '/NY/gtc_files'
ny_idat_path = basedir + '/NY/idats'
ny_out = out_genotypes + '/NY'

In [None]:
# Load the sample manifest CSV. header=10 tells pandas to take the column names from
# row 10 (0-indexed) and to start the data on the row after it, so the rows above are
# not parsed as data.
# NOTE(review): presumably the skipped rows are a sample-sheet preamble — confirm the
# offset against the file if the manifest is ever regenerated.
manifest = pd.read_csv(manifest_txt_path, header=10)

In [None]:
# create new directories to store split cohorts
# os.makedirs(..., exist_ok=True) creates any missing parent directories and is a
# no-op when the directory is already there, so this cell can be re-run safely
# (plain `mkdir` errors on every existing directory and on missing parents).
for cohort_dir in (
    f'{basedir}/SHULMAN/idats',
    f'{basedir}/NY/idats',
    f'{out_genotypes}/SHULMAN/ped',
    f'{out_genotypes}/NY/ped',
    f'{shulman_out}/plink',
    f'{ny_out}/plink',
):
    os.makedirs(cohort_dir, exist_ok=True)

In [None]:
# create filenames ("<SentrixBarcode_A>_<SentrixPosition_A>") and split the manifest
# into its respective cohorts using the Study column
manifest['filename'] = (manifest['SentrixBarcode_A'].astype(str) + '_') + manifest['SentrixPosition_A']
shulman = manifest[manifest['Study'].eq('Shulman')]
ny = manifest[manifest['Study'].eq('NY')]

In [None]:
# create directory for each sentrix barcode A if they don't already exist
def split_cohort_idats(manifest_df, name, idat_file_path, out_root=None):
    """Copy one cohort's IDAT files into ``<out_root>/<name>/idats/<barcode>/``.

    Parameters
    ----------
    manifest_df : pandas.DataFrame
        Rows for a single cohort. Must provide the ``SentrixBarcode_A`` and
        ``filename`` columns; ``filename`` is the IDAT file stem without the
        ``_Grn.idat`` / ``_Red.idat`` suffix.
    name : str
        Cohort directory name under ``out_root`` (e.g. ``'SHULMAN'``).
    idat_file_path : str
        Directory holding the source IDATs, laid out as
        ``<idat_file_path>/<barcode>/<filename>_{Grn,Red}.idat``.
    out_root : str, optional
        Root under which ``<name>/idats`` is created. Defaults to the
        notebook-level ``basedir``.

    Returns
    -------
    list of str
        Source paths of every expected IDAT file that was not found, in
        manifest order (Grn before Red for each sample). Empty when nothing
        is missing.
    """
    if out_root is None:
        # Looked up lazily so callers that pass out_root never need the global.
        out_root = basedir
    cohort_idat_dir = f'{out_root}/{name}/idats'

    for code in manifest_df.SentrixBarcode_A.unique():
        code_dir = f'{cohort_idat_dir}/{code}'
        if os.path.exists(code_dir):
            print(f'{code_dir} already exists')
        else:
            # makedirs (not mkdir) so a missing <name>/idats parent is not fatal
            os.makedirs(code_dir)

    missing_idats = []

    # split idats into respective cohorts and populate each sentrix barcode A directory
    for sentrix_code, filename in zip(manifest_df.SentrixBarcode_A, manifest_df.filename):
        for channel in ('Grn', 'Red'):
            idat_name = f'{filename}_{channel}.idat'
            src = f'{idat_file_path}/{sentrix_code}/{idat_name}'
            if os.path.isfile(src):
                shutil.copyfile(src=src, dst=f'{cohort_idat_dir}/{sentrix_code}/{idat_name}')
            else:
                missing_idats.append(src)

    # Return (rather than discard) the list so the caller can inspect what was skipped.
    return missing_idats

In [None]:
# copy each cohort's IDATs from the shared delivery folder into its own idats/ tree
split_cohort_idats(shulman, 'SHULMAN', idat_file_path)
split_cohort_idats(ny, 'NY', idat_file_path)


In [None]:
# swarm command to run in parallel:
# one iaap-cli gencall command line per Shulman Sentrix barcode directory,
# written one per line to shulman_idat_to_ped.swarm
with open('shulman_idat_to_ped.swarm', 'w') as swarm_file:
    for code in shulman.SentrixBarcode_A.unique():
        # adjacent literals are concatenated into a single-line command
        shulman_idat_to_ped_cmd = (
            f'{iaap} gencall '
            f'{bpm} '
            f'{cluster_file} '
            f'{shulman_out}/ped/ '
            f'-f {shulman_idat_path}/{code} '
            '-p '
            '-t 8'
        )
        swarm_file.write(shulman_idat_to_ped_cmd + '\n')
# the with-block closes the file on exit; no explicit close() is needed

In [None]:
# !swarm -f shulman_idat_to_ped.swarm -g 32 -t 16 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

In [None]:
# copy map file to match name of each ped
# The shared .map written next to the per-sample .ped files is duplicated as
# <filename>.map so every ped has a same-named map beside it.
# The path is held in `map_file` rather than `map`, so the builtin map() is not
# shadowed for the rest of the kernel session.
map_file = f'{shulman_out}/ped/NeuroBooster_20042459_A1.map'
for filename in shulman.filename:
    ped = f'{shulman_out}/ped/{filename}.ped'
    out_map = f'{shulman_out}/ped/{filename}.map'
    if os.path.isfile(ped):
        shutil.copyfile(src=map_file, dst=out_map)
    else:
        # no ped for this sample, so a matching map would be orphaned; skip it
        print(f'{ped} does not exist!')
        print(f'{out_map} creation cancelled')


In [None]:
# run one `plink --file ... --make-bed` per sample, writing the result under plink/
# NOTE(review): shell_do is not defined in the visible part of this notebook —
# presumably a helper that executes the command string; confirm it is imported/defined.
for filename in shulman.filename:
    ped = f'{shulman_out}/ped/{filename}'
    make_bed_cmd = (
        'plink '
        f'--file {ped} '
        '--make-bed '
        f'--out {shulman_out}/plink/{filename}'
    )

    shell_do(make_bed_cmd)

In [None]:
# write plink merge command
# 1) merge list: one per-sample fileset prefix per line
with open("shulman_merge_bed.list", 'w') as merge_list:
    merge_list.writelines(
        f'{shulman_out}/plink/{filename}\n' for filename in shulman.filename
    )

# 2) swarm file holding the single merge command (written without a trailing newline)
plink_merge_cmd = (
    'plink '
    '--merge-list shulman_merge_bed.list '
    '--make-bed '
    f'--out {shulman_out}/plink/shulman'
)
with open("shulman_merge.swarm", 'w') as merge_swarm:
    merge_swarm.write(plink_merge_cmd)
# both files are closed by their with-blocks; no explicit close() is needed

In [None]:
# Submit the merge command file written in the previous cell.
# NOTE(review): the next cell runs plink on {shulman_out}/plink/shulman, which is the
# --out target of this merge. If swarm returns before the job has finished (presumably
# it only queues the job — confirm), wait for completion before running the next cell.
!swarm -f shulman_merge.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

In [None]:
# get basic statistics: run plink --missing on the merged Shulman fileset;
# the .lmiss / .imiss outputs are read in the following cell
# NOTE(review): shell_do is not defined in the visible part of this notebook — confirm.
plink_miss_cmd = (
    'plink '
    f'--bfile {shulman_out}/plink/shulman '
    '--missing '
    f'--out {shulman_out}/plink/shulman'
)

shell_do(plink_miss_cmd)

In [None]:
# get average call rate
# .lmiss holds per-variant and .imiss per-sample missingness from `plink --missing`.
# F_MISS is a missing-call *fraction* in [0, 1], so the rate in percent is
# 100 * (1 - mean(F_MISS)); `100 - mean(F_MISS)` would always print ~99.x.
# sep is a raw string so the regex `\s+` is not treated as an invalid escape sequence.
lmiss = pd.read_csv(f'{shulman_out}/plink/shulman.lmiss', sep=r'\s+')
imiss = pd.read_csv(f'{shulman_out}/plink/shulman.imiss', sep=r'\s+')
avg_call_rate = 100 * (1 - lmiss.F_MISS.mean())
avg_geno_rate = 100 * (1 - imiss.F_MISS.mean())
print(f'Average Call Rate: {avg_call_rate}')
print(f'Average Genotyping Rate: {avg_geno_rate}')