In [30]:
import pandas as pd
import glob
import numpy as np
import os
import shutil

In [53]:
# --- Raw genotype workspace: everything below hangs off the GD root ---
gd_path = '/data/CARD/PD/GP2/raw_genotypes/GD'
idat_path = gd_path + '/idats'                  # searched by `iaap gencall -f`
swarm_scripts_dir = gd_path + '/swarm_scripts'  # generated .swarm job files
raw_plink_path = gd_path + '/plink'             # per-sample ped/map and bed filesets
gd_qc_path = gd_path + '/QC'                    # merged fileset handed to the QC pipeline

# --- Illumina resources passed to `iaap gencall` ---
ilmn_files_path = '/data/CARD/PD/GP2/ilmn_files'
bpm = ilmn_files_path + '/NeuroBooster_20042459_A1.bpm'
egt = ilmn_files_path + '/NBSCluster_file_n1393_011921.egt'
iaap = ilmn_files_path + '/iaap-cli/iaap-cli'

# --- Sample metadata ---
clin_dir = '/data/CARD/PD/GP2/clinical'
key_dir = '/data/CARD/PD/GP2/key_files'
gd_key = key_dir + '/gd_key.csv'


In [46]:
# Clinical sample sheets for the two sites.
nhgri_clin = pd.read_csv(f'{clin_dir}/sample_sheet_nhgri.csv')
ny_clin = pd.read_csv(f'{clin_dir}/sample_sheet_newyork.csv')

# Key file. `filename` is "<SentrixBarcode_A>_<SentrixPosition_A>", the stem
# the later cells use to locate each sample's ped/map/bed files.
key = pd.read_csv(gd_key)
key['filename'] = (
    key['SentrixBarcode_A'].astype(str)
    + '_'
    + key['SentrixPosition_A']
)

# Constant family ID for every sample.
key['FID'] = '0'

# Headerless, tab-separated four-column file (FID, filename, FID, Sample_ID)
# passed to `plink --update-ids` when the per-sample filesets are merged.
(
    key[['FID', 'filename', 'FID', 'Sample_ID']]
    .to_csv(f'{gd_path}/update_ids.txt', sep='\t', header=False, index=False)
)

In [36]:
# Build the swarm file that converts IDATs to ped files: one
# `iaap gencall` command per unique Sentrix barcode, one command per line.
# Create the script directory first so a fresh run does not fail on open().
os.makedirs(swarm_scripts_dir, exist_ok=True)

with open(f'{swarm_scripts_dir}/idat_to_ped.swarm', 'w') as f:
    for code in key.SentrixBarcode_A.unique():
        # Backslash-newlines inside the literal keep the whole command on a
        # single line of the swarm file.
        idat_to_ped_cmd = f'\
{iaap} gencall \
{bpm} \
{egt} \
{raw_plink_path}/ \
-f {idat_path}/{code} \
-p \
-t 8'

        f.write(f'{idat_to_ped_cmd}\n')
# No explicit f.close(): the `with` statement has already closed the file.

In [38]:
# Submit the IDAT -> ped conversion swarm; the recorded output below is the job ID.
# NOTE(review): this requests -t 16 per command while the generated gencall
# commands pass -t 8 — confirm the thread counts are meant to differ.
!swarm -f {swarm_scripts_dir}/idat_to_ped.swarm -g 32 -t 16 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

21031358


In [39]:
# Every per-sample .ped needs a .map with the same stem, so the single shared
# map file is duplicated under each sample's name.
map_file = f'{raw_plink_path}/NeuroBooster_20042459_A1.map'
for filename in key.filename:
    ped = f'{raw_plink_path}/{filename}.ped'
    out_map = f'{raw_plink_path}/{filename}.map'
    if not os.path.isfile(ped):
        # No ped for this sample: report it and skip the copy.
        print(f'{ped} does not exist!')
        print(f'{out_map} creation cancelled')
        continue
    shutil.copyfile(map_file, out_map)

In [40]:
# Build the swarm file that converts each per-sample ped/map pair into a
# binary fileset: one `plink --make-bed` command per sample in the key file.
with open(f'{swarm_scripts_dir}/make_bed.swarm', 'w') as f:
    for filename in key.filename:
        # Input prefix; the output prefix below is the same per-sample stem.
        ped = f'{raw_plink_path}/{filename}'
        make_bed_cmd = f'\
plink \
--file {ped} \
--make-bed \
--out {raw_plink_path}/{filename}'

        f.write(f'{make_bed_cmd}\n')
# No explicit f.close(): the `with` statement has already closed the file.

In [42]:
!swarm -f {swarm_scripts_dir}/make_bed.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

21031946


In [55]:
# Write the list of per-sample binary filesets to merge (one prefix per line),
# skipping and reporting any sample whose .bed file is missing.
with open(f"{raw_plink_path}/merge_bed.list", 'w') as f:
    for filename in key.filename:
        bed = f'{raw_plink_path}/{filename}'
        if os.path.isfile(f'{bed}.bed'):
            f.write(f'{bed}\n')
        else:
            print(f'{bed} does not exist!!!')

# Write the single-command swarm file that merges the listed filesets,
# applies the ID updates from update_ids.txt and writes the result to
# {gd_qc_path}/GD.
with open(f"{swarm_scripts_dir}/merge.swarm", 'w') as f:
    plink_merge_cmd = f'\
plink \
--merge-list {raw_plink_path}/merge_bed.list \
--update-ids {gd_path}/update_ids.txt \
--make-bed \
--out {gd_qc_path}/GD'
    # Newline-terminate the command, as the other generated swarm files do.
    f.write(f"{plink_merge_cmd}\n")
# No explicit f.close() calls: each `with` statement closes its own file.

In [56]:
!swarm -f {swarm_scripts_dir}/merge.swarm -g 64 -t 32 --time=10:00:00 --logdir swarm --gres=lscratch:20 --partition=norm

21034777


In [66]:
# Inputs and output prefix for the QC pipeline run.
geno_path = gd_qc_path + '/GD'                  # merged fileset prefix (--geno)

ref_dir_path = '/data/LNG/vitaled2/1kgenomes'
ref_panel = ref_dir_path + '/1kg_ashkj_ref_panel_gp2_pruned'   # passed as --ref
ref_labels = ref_dir_path + '/ref_panel_ancestry.txt'          # passed as --ref_labels

out_path = '/data/CARD/PD/GP2/genotypes/GD/clean/GD'           # passed as --out

In [67]:
with open(f'{swarm_scripts_dir}/run_qc_pipeline.swarm','w') as f:
    run_pipeline = f'python3 run_gd_pipeline.py --geno {geno_path} --ref {ref_panel} --ref_labels {ref_labels} --out {out_path}'
    f.write(f'{run_pipeline}\n')
f.close()
!cat {swarm_scripts_dir}/run_qc_pipeline.swarm

python3 run_gd_pipeline.py --geno /data/CARD/PD/GP2/raw_genotypes/GD/QC/GD --ref /data/LNG/vitaled2/1kgenomes/1kg_ashkj_ref_panel_gp2_pruned --ref_labels /data/LNG/vitaled2/1kgenomes/ref_panel_ancestry.txt --out /data/CARD/PD/GP2/genotypes/GD/clean/GD


In [68]:
# Submit the QC pipeline swarm; the recorded output below is the job ID.
!swarm -f {swarm_scripts_dir}/run_qc_pipeline.swarm -g 64 -t 32 --time=10:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

21042872


In [70]:
# Load the tables stored in the QC pipeline's HDF5 metrics file, one key each.
# pd.read_hdf raises FileNotFoundError if the file was never written, e.g.
# because the pipeline job did not run to completion.
QC_metrics_path = f'{out_path}.QC.metrics.h5'

metrics_df = pd.read_hdf(QC_metrics_path, key='QC')
ancestry_counts_df = pd.read_hdf(QC_metrics_path, key='ancestry_counts')
pred_ancestry_labels = pd.read_hdf(QC_metrics_path, key='ancestry_labels')
# `index=True` removed: it is not a read_hdf option and did not affect the read.
conf_mat_df = pd.read_hdf(QC_metrics_path, key='confusion_matrix')
ref_pcs = pd.read_hdf(QC_metrics_path, key='ref_pcs')
projected_pcs = pd.read_hdf(QC_metrics_path, key='projected_pcs')
total_umap = pd.read_hdf(QC_metrics_path, key='total_umap')
ref_umap = pd.read_hdf(QC_metrics_path, key='ref_umap')
new_samples_umap = pd.read_hdf(QC_metrics_path, key='new_samples_umap')

FileNotFoundError: File /data/CARD/PD/GP2/genotypes/GD/clean/GD.QC.metrics.h5 does not exist

In [71]:
# List what the QC pipeline wrote to the clean output directory. In the
# recorded run only the GD_AJ, GD_AMR and GD_EUR filesets are present and
# there is no GD.QC.metrics.h5.
!ls /data/CARD/PD/GP2/genotypes/GD/clean/

GD_AJ.bed  GD_AJ.hh    GD_AMR.bim  GD_AMR.log  GD_EUR.fam
GD_AJ.bim  GD_AJ.log   GD_AMR.fam  GD_EUR.bed  GD_EUR.hh
GD_AJ.fam  GD_AMR.bed  GD_AMR.hh   GD_EUR.bim  GD_EUR.log


In [78]:
# Inspect the `.e` log (presumably stderr) of QC pipeline job 21042872.
# NOTE(review): the job ID is hard-coded, and the QC job is submitted with
# --logdir {swarm_scripts_dir}/logs while this reads from a relative `swarm/`
# directory — confirm which location holds the log on a fresh run.
!cat swarm/swarm_21042872_0.e

/data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_missing.hh ); many commands treat
these as missing.
treat these as missing.
/data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate.hh ); many commands treat
these as missing.
treat these as missing.
/data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate_sex_tmp1.hh ); many commands
treat these as missing.
treat these as missing.
/data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate_sex_tmp2.hh ); many commands
treat these as missing.
/data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate_sex.hh ); many commands treat
these as missing.
treat these as missing.
/data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate_sex_ancestry_AJ.hh ); many
commands treat these as missing.
treat these as missing.
/data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate_sex_ancestry_EUR.hh ); many
commands treat these as missing.
treat these as missing.
/data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate_sex_ancestry_AMR.hh ); many
commands treat these as missing.
treat these as missing.
treat the

In [79]:
# Show the plink log for the SAS ancestry group's `_related_het` step. In the
# recorded run plink loads 2 people and stops with
# "Error: No people remaining after --remove."
!cat /data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate_sex_ancestry_SAS_related_het.log

PLINK v1.90b4.4 64-bit (21 May 2017)
Options in effect:
  --bfile /data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate_sex_ancestry_SAS_related
  --make-bed
  --out /data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate_sex_ancestry_SAS_related_het
  --remove /data/CARD/PD/GP2/raw_genotypes/GD/QC/GD_callrate_sex_ancestry_SAS_related_het.outliers

Hostname: cn3483
Working directory: /gpfs/gsfs12/users/vitaled2/GenoTools/GP2_data_processing
Start time: Fri Aug 13 04:22:16 2021

Random number seed: 1628842936
257652 MB RAM detected; reserving 128826 MB for main workspace.
2004347 variants loaded from .bim file.
2 people (0 males, 2 females) loaded from .fam.
Error: No people remaining after --remove.

End time: Fri Aug 13 04:22:17 2021
