In [1]:
import pandas as pd
import os
import shutil
import numpy as np
import glob
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm

In [2]:
idat_path = '/data/GP2/raw_genotypes/idats/'
swarm_scripts_dir = f'/data/CARD/PD/GP2/swarm_scripts'

ilmn_files_path = '/data/CARD/PD/GP2/ilmn_files'
# A1 is hg19, A2 is hg38. csv needed for indel calls
bpm_csv = f'{ilmn_files_path}/NeuroBooster_20042459_A2.csv' 
bpm = f'{ilmn_files_path}/NeuroBooster_20042459_A2.bpm'
egt = f'{ilmn_files_path}/recluster_09272022.egt'
iaap = f'{ilmn_files_path}/iaap-cli/iaap-cli'

ref_fasta = '/data/vitaled2/ref/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna'

chroms = [str(i) for i in range(1,23)] + ['X','Y','M']


key_file = '/data/GP2/clinical/master_key/GP2_master_key_release4.txt'
key = pd.read_csv(f'{key_file}', sep='\t')
key.loc[:,'filename'] = key.loc[:,'SentrixBarcode_A'].astype(str) + '_' + key.loc[:,'SentrixPosition_A'].astype(str)

cohorts = ['APGS',
 'BCM',
 'BIOFIND',
 'BLAACPD-KPM',
 'BLAACPD-RUSH',
 'BLAACPD-UAB',
 'BLAACPD-UC',
 'COPN',
 'CORIELL',
 'KOC',
 'LCC',
 'MDGAP-EBB',
 'MDGAP-KINGS',
 'MDGAP-QSBB',
 'NZP3',
 'PAGE',
 'PPMI',
 'S4',
 'SYDBB',
 'SYNAPS-KZ',
 'UMD']

release_key = key.loc[key['study'].isin(cohorts)]
release_key.to_csv('/data/GP2/raw_genotypes/cnvs/release4/release4_key.csv', index=False, header=True)

samples_list = release_key.loc[:,'filename'].unique()
barcodes_list = list(set([x.split('_')[0] for x in samples_list]))

genes = pd.read_csv('/data/CARD/PD/GP2/ref_panel/glist-hg38', sep='\s+', header=None, names=['chr','start','end','symbol'], dtype={'chr':str,'start':int,'end':int})
genes.columns = ['CHR','START','STOP','NAME']
genes_out = genes.loc[genes.CHR.isin(chroms)]
genes_out[['NAME','CHR','START','STOP']].to_csv('/data/CARD/PD/GP2/ref_panel/glist_hg38_intervals.csv', index=False)
# gene_list originally from plink glist
gene_list = '/data/CARD/PD/GP2/ref_panel/glist_hg38_intervals.csv'

raw_geno_path = '/data/GP2/raw_genotypes'
snp_metrics_path = f'{raw_geno_path}/snp_metrics'
idat_path = f'{raw_geno_path}/idats'
plink_file_path = f'{raw_geno_path}/ped_bed'

In [3]:
# len(samples_list)
# sorted(cohorts)
len(release_key)

19057

In [4]:
# get snp metrics
bcftools_plugins_path = 'bin'
with open(f'{swarm_scripts_dir}/snp_metrics.swarm', 'w') as f:
    for code in barcodes_list:
        idat_path_ = f'{idat_path}/{code}'
        metrics_out = f'{snp_metrics_path}'

        cmd = f'\
python run_snp_metrics_pipeline.py \
--idat_path {idat_path_} \
--bpm {bpm} \
--bpm_csv {bpm_csv} \
--egt {egt} \
--ref_fasta {ref_fasta} \
--out_path {metrics_out} \
--iaap {iaap} \
--bcftools_plugins_path {bcftools_plugins_path}'
        f.write(f'{cmd}\n')
f.close()


In [7]:
!swarm -f {swarm_scripts_dir}/snp_metrics.swarm -g 16 -t 16 --time=04:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

57802042


In [4]:
# check if snp metrics produced across all samples
completed_list = []
missing_list = []
for sample in samples_list:
    code = sample.split('_')[0]
    metrics_path = f'{snp_metrics_path}/{code}'
    for chrom in chroms:
        mfile = f'{metrics_path}/snp_metrics_{sample}_chr{chrom}.csv'
        if os.path.isfile(mfile):
            completed_list.append(sample)
        else:
            missing_list.append(sample)
completed_final = list(set(completed_list))
missing_final =list(set(missing_list))


In [5]:
print(len(completed_final))
print(len(missing_final))

18974
83


In [6]:
# now check if we have idats for missing snp metrics
missing_idats = []
exists_idats = []
for sample in missing_final:
    code = sample.split('_')[0]
    grn = f'{idat_path}/{code}/{sample}_Grn.idat'
    red = f'{idat_path}/{code}/{sample}_Red.idat'
    if os.path.isfile(grn) & os.path.isfile(red):
        exists_idats.append(sample)
    else:
        missing_idats.append(sample)

In [7]:
print(len(missing_idats))
print(len(exists_idats))

49
34


In [9]:
release_key

Unnamed: 0,GP2sampleID,SentrixBarcode_A,SentrixPosition_A,study,sample_id,Phenotype,sex,sex_for_qc,race,race_for_qc,study_arm,FID,IID,pheno,filename
767,KOC_000001_s1,206451070115,R01C01,KOC,1620-033,PD,F,2,White,White,affected,0,206451070115_R01C01,2,206451070115_R01C01
768,KOC_000002_s1,206451070115,R02C01,KOC,1620-034,PD,F,2,White,White,affected,0,206451070115_R02C01,2,206451070115_R02C01
769,KOC_000003_s1,206451070115,R03C01,KOC,1620-037,PD,F,2,White,White,affected,0,206451070115_R03C01,2,206451070115_R03C01
770,KOC_000004_s1,206451070115,R04C01,KOC,1620-002,PD,M,1,White,White,affected,0,206451070115_R04C01,2,206451070115_R04C01
771,KOC_000005_s1,206451070115,R05C01,KOC,1620-003,PD,F,2,White,White,affected,0,206451070115_R05C01,2,206451070115_R05C01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28855,MDGAP-EBB_000149_s1,206412050061,R04C01,MDGAP-EBB,EDB_SD051/16,Control,male,1,,Not Reported,Controls,0,206412050061_R04C01,1,206412050061_R04C01
28856,MDGAP-EBB_000150_s1,206412050061,R05C01,MDGAP-EBB,EDB_SD014/18,Control,female,2,,Not Reported,Controls,0,206412050061_R05C01,1,206412050061_R05C01
28857,MDGAP-EBB_000151_s1,206412050061,R06C01,MDGAP-EBB,EDB_SD015/18,Control,male,1,,Not Reported,Controls,0,206412050061_R06C01,1,206412050061_R06C01
28858,MDGAP-EBB_000152_s1,206412050061,R07C01,MDGAP-EBB,EDB_SD028/18,Control,female,2,,Not Reported,Controls,0,206412050061_R07C01,1,206412050061_R07C01


In [7]:
cnv_path = f'/data/GP2/raw_genotypes/cnvs/release4'
# using just autosomes for CNV analysis due to call quality
chroms = [str(i) for i in range(1,23)]

ancestry_labels = [x.split('/')[-1].replace('.bed','').split('_')[-1] for x in glob.glob('/data/GP2/quality_control/release4/genotype_qc/GP2_round4_JAN_24_2023*.bed') if '_maf_hwe' not in x]

release_key[['FID','GP2sampleID']].to_csv(f'{cnv_path}/release4.samples', sep='\t', header=False, index=False)
release_key[['GP2sampleID','IID']].to_csv(f'{cnv_path}/release4_sample_id_key.csv')
release_covars = release_key.loc[:,['FID', 'GP2sampleID','sex_for_qc', 'age', 'age_of_onset']]

cnv_types = ['PERCENT_BAF_INSERTION','PERCENT_L2R_DELETION','PERCENT_L2R_DUPLICATION']

In [23]:
# snp QC for cnvs
# for label in ['EUR']:
for label in ancestry_labels:
    geno_name = f'GP2_round4_JAN_24_2023_{label}'
    out_path = f'{cnv_path}/{geno_name}'
    geno_path = f'/data/GP2/quality_control/release4/genotype_qc/{geno_name}'

    cmd1 = f'\
module load plink/1.9; plink \
--bfile {geno_path} \
--keep {cnv_path}/release4.samples \
--make-bed \
--out {out_path}'

    cmd2 = f'\
module load plink/1.9; plink \
--bfile {out_path} \
--maf 0.01 \
--geno 0.02 \
--hwe 5e-6 \
--autosome \
--make-bed \
--out {out_path}_maf_geno_hwe'

    cmd3 = f'\
module load plink/1.9; \
plink \
--bfile {out_path}_maf_geno_hwe \
--indep-pairwise 1000 10 0.01 \
--autosome \
--out {out_path}_maf_geno_hwe_prune'
    
    cmd4 = f'\
module load plink/1.9; \
plink \
--bfile {out_path}_maf_geno_hwe \
--extract {out_path}_maf_geno_hwe_prune.prune.in \
--make-bed \
--out {out_path}_clean'

    cmd5 = f'\
module load plink/1.9; \
plink \
--bfile {out_path}_clean \
--pca \
--out {out_path}_clean_pcs'

    cmds = [cmd1, cmd2, cmd3, cmd4, cmd5]

    for cmd in cmds:
        !{cmd}

    pcs = pd.read_csv(f'{out_path}_clean_pcs.eigenvec', sep='\s+')
    pc_num = pcs.iloc[:, 2:].shape[1]
    pc_names = ['FID','GP2sampleID'] + [f'PC{i}' for i in range(1, pc_num+1)]
    pcs.columns = pc_names

    cov = pcs.merge(release_covars, on=['FID','GP2sampleID'], how='left')
    cov.age.fillna(cov.age.mean(), inplace=True)
    cov.age_of_onset.fillna(cov.age_of_onset.mean(), inplace=True)
    cov.sex_for_qc.fillna(cov.sex_for_qc.median(), inplace=True)
    cov.rename(columns={'GP2sampleID':'sampleid','sex_for_qc':'sex'}, inplace=True)
    cov.to_csv(f'{out_path}_clean.cov', sep='\t', header=True, index=False)

    samples = cov.merge(release_key[['GP2sampleID','IID']], left_on='sampleid', right_on='GP2sampleID', how='left')
    samples['IID'].to_csv(f'{out_path}_clean_barcode.samples', header=False, index=False)

    bim = pd.read_csv(f'{out_path}_clean.bim', sep='\s+', header=None, names=['chr','id','pos','bp','a1','a2'], usecols=['id'])
    bim.to_csv(f'{out_path}_clean.snps', sep='\t', header=False, index=False)


[-] Unloading plink  3.6-alpha 
[+] Loading plink  1.9  on cn4285 

The following have been reloaded with a version change:
  1) plink/3.6-alpha => plink/1.9

PLINK v1.90b3.36 64-bit (31 Mar 2016)      https://www.cog-genomics.org/plink2
(C) 2005-2016 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /data/GP2/raw_genotypes/cnvs/release4/GP2_round4_JAN_24_2023_MDE.log.
Options in effect:
  --bfile /data/GP2/quality_control/release4/genotype_qc/GP2_round4_JAN_24_2023_MDE
  --keep /data/GP2/raw_genotypes/cnvs/release4/release4.samples
  --make-bed
  --out /data/GP2/raw_genotypes/cnvs/release4/GP2_round4_JAN_24_2023_MDE

515537 MB RAM detected; reserving 257768 MB for main workspace.
1945776 variants loaded from .bim file.
128 people (91 males, 37 females) loaded from .fam.
97 phenotype values loaded from .fam.
--keep: 121 people remaining.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 121 founders and 0 nonfounders present.

In [60]:
for label in ancestry_labels:
# for label in ['EUR']:
    out_name = f'GP2_round4_JAN_24_2023_{label}'
    out_path = f'{cnv_path}/{out_name}'
    fam = pd.read_csv(f'{out_path}_clean.fam', sep='\s+', header=None, names=['fid','iid','pat','mat','sex','pheno'])
    pheno_counts = fam.pheno.value_counts()
    ncases = pheno_counts[2]
    ncontrols = pheno_counts[1]
    nsnps = !cat {out_path}_clean.bim | wc -l
    print(f'{label} | n_cases: {ncases} | n_controls: {ncontrols} | n_snps: {int(nsnps[0])}')
    print()

MDE | n_cases: 77 | n_controls: 13 | n_snps: 2276

AMR | n_cases: 259 | n_controls: 208 | n_snps: 4381

AJ | n_cases: 644 | n_controls: 370 | n_snps: 4720

AAC | n_cases: 170 | n_controls: 1122 | n_snps: 13840

EUR | n_cases: 8066 | n_controls: 4371 | n_snps: 14629

EAS | n_cases: 111 | n_controls: 33 | n_snps: 2244

CAS | n_cases: 115 | n_controls: 196 | n_snps: 3807

SAS | n_cases: 56 | n_controls: 18 | n_snps: 1849

FIN | n_cases: 9 | n_controls: 1 | n_snps: 1331

AFR | n_cases: 39 | n_controls: 362 | n_snps: 6784



In [27]:

with open(f'{swarm_scripts_dir}/cnvs.swarm', 'w') as f:
    for label in ancestry_labels:
    # for label in ['EUR']:
        out_name = f'GP2_round4_JAN_24_2023_{label}'
        geno_path = f'{cnv_path}/{out_name}_clean'
        bim = f'{geno_path}.bim'
        samples = pd.read_csv(f'{geno_path}_barcode.samples', header=None, names=['IID'])
        
        for sample in samples.IID.unique():
            code = sample.split('_')[0]
            for chrom in chroms:
            
                mfile = f'{snp_metrics_path}/{code}/snp_metrics_{sample}_chr{chrom}.csv'

                cnv_out = f'{cnv_path}/CNV_{label}_{sample}_chr{chrom}.csv'
                intervals = f'/data/CARD/PD/GP2/ref_panel/glist_hg38_chr{chrom}.csv'

                cmd = f'\
python run_cnv_pipeline.py \
--metrics {mfile} \
--bim {bim} \
--out_path {cnv_out} \
--intervals {intervals} \
--min_variants 10 \
--kb_window 250 \
--min_gentrain 0.2'
            
                f.write(f'{cmd}\n')
f.close()

In [29]:
!swarm -f {swarm_scripts_dir}/cnvs.swarm -g 4 -t 8 --time=00:30:00 --logdir {swarm_scripts_dir}/logs --module python/3.8 --gres=lscratch:20 --partition=norm

57895756


In [30]:
# write dosages per chromosome per ancestry per cnv-type

with open(f'{swarm_scripts_dir}/cnv_dosages.swarm', 'w') as f:
    for label in ancestry_labels:
        for chrom in chroms:

            cnv_files_list = glob.glob(f'{cnv_path}/CNV_{label}_*_chr{chrom}.csv')
            cnv_files_df = pd.DataFrame({'filename':cnv_files_list})
            cnv_files_df.to_csv(f'{cnv_path}/CNV_{label}_chr{chrom}_files.csv', header=False, index=False)

            for cnv_type in cnv_types:
                
                cmd = f'\
python /data/vitaled2/CNVs/run_cnv_dosage_pipeline.py \
--files {cnv_path}/CNV_{label}_chr{chrom}_files.csv \
--label {label} \
--chrom {chrom} \
--cnv_type {cnv_type} \
--out_path {cnv_path}/CNV_{label}_chr{chrom}_{cnv_type}.csv'
                f.write(f'{cmd}\n')
f.close()


In [32]:
!swarm -f {swarm_scripts_dir}/cnv_dosages.swarm -g 32 -t 16 --logdir {swarm_scripts_dir}/logs --module python/3.8 --gres=lscratch:20 --partition=norm

57901894


In [4]:
cnv_dosage = pd.read_csv('/data/GP2/raw_genotypes/cnvs/release4/CNV_EUR_204958250147_R03C01_chr1.csv')
# cnv_dosage[cnv_dosage.NUM_VARIANTS>250]
cnv_dosage.NUM_VARIANTS.std()
cnv_dosage.NUM_VARIANTS.mean()

2.6910852713178293

In [5]:
cnv_dosage[cnv_dosage.NUM_VARIANTS>10]

Unnamed: 0,INTERVAL,NUM_VARIANTS,PERCENT_BAF_INSERTION,PERCENT_L2R_DELETION,PERCENT_L2R_DUPLICATION,START_PLUS_WINDOW,START,STOP,STOP_PLUS_WINDOW
104,ARHGEF16,13,0.0,0.0,0.0,3204582,3454582,3481113,3731113
264,CAMTA1,12,0.0,0.0,0.0,6535323,6785323,7769706,8019706
416,CSMD2,13,0.0,0.0,0.230769,33263998,33513998,34165842,34415842
440,DAB1,11,0.0,0.0,0.090909,56747905,56997905,58250539,58500539
563,ERO1LB,11,0.0,0.0,0.090909,235965121,236215121,236282039,236532039
631,FBXO6,11,0.0,0.0,0.0,11414092,11664092,11674352,11924352
671,FMN2,14,0.0,0.0,0.0,239841884,240091884,240475189,240725189
771,GREM2,11,0.0,0.0,0.0,240239572,240489572,240612162,240862162
880,IGSF21,12,0.0,0.166667,0.0,17857745,18107745,18378483,18628483
921,KAZN,14,0.0,0.0,0.142857,14348716,14598716,15118048,15368048


In [41]:
with open(f'{swarm_scripts_dir}/update_cnv_ids.swarm','w') as f:
    for label in ancestry_labels:
        for cnv_type in cnv_types:
            for chrom in chroms:

                dosagefile = f'{cnv_path}/CNV_{label}_chr{chrom}_{cnv_type}.csv'
                dosagefile_out = f'{cnv_path}/CNV_{label}_chr{chrom}_{cnv_type}_gp2ids.csv'
                pheno_out = f'{cnv_path}/GP2_{label}_chr{chrom}_{cnv_type}.pheno'
                cmd = f'\
python update_cnv_ids.py \
--dosagefile {dosagefile} \
--key /data/GP2/raw_genotypes/cnvs/release4/release4_key.csv \
--pheno_out {pheno_out} \
--out_path {dosagefile_out}'
                f.write(f'{cmd}\n')
f.close()

In [44]:
!swarm -f {swarm_scripts_dir}/update_cnv_ids.swarm -g 8 -t 8 --logdir {swarm_scripts_dir}/logs --module python/3.8 --gres=lscratch:20 --partition=norm

57903129


In [8]:
cnv_out_path = '/data/GP2/raw_genotypes/cnvwas'
with open(f'{swarm_scripts_dir}/cnvwas.swarm', 'w') as f:
    for label in ancestry_labels:
        for cnv_type in cnv_types:
            for chrom in chroms:

                dosagefile = f'{cnv_path}/CNV_{label}_chr{chrom}_{cnv_type}_gp2ids.csv'
                pheno = f'{cnv_path}/GP2_{label}_chr{chrom}_{cnv_type}.pheno'
                covar = f'{cnv_path}/GP2_round4_JAN_24_2023_{label}_clean.cov'
                out_path = f'{cnv_out_path}/GP2_{label}_chr{chrom}_{cnv_type}_cnvwas.tab'

                cmd = f'\
python run_cnvwas_pipeline.py \
--cnv_dosage_file {dosagefile} \
--out_path {out_path} \
--pheno {pheno} \
--covar {covar}'
            
                f.write(f'{cmd}\n')
f.close()


In [57]:
!swarm -f {swarm_scripts_dir}/cnvwas.swarm -g 32 -t 16 --time=00:30:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

57905424


In [43]:
label = 'EUR'
# cnv_type = 'PERCENT_BAF_INSERTION'
cnv_type = 'PERCENT_L2R_DELETION'
chrom = '1'

# for cnv_type in cnv_types:
total_cnvtype_df = pd.DataFrame()
for chrom in chroms:
    tab = f'{cnv_out_path}/GP2_{label}_chr{chrom}_{cnv_type}_cnvwas.tab'

    if os.path.isfile(tab):
        sumstats = pd.read_csv(tab, sep='\s+', engine='python')
        total_cnvtype_df = pd.concat([total_cnvtype_df, sumstats], ignore_index=True)
    else:
        print(f'{tab} does not exist!')





In [44]:
from GWAS.gwas import calculate_inflation
pheno = pd.read_csv(f'{cnv_path}/GP2_EUR_chr1_{cnv_type}.pheno', sep='\t', engine='python')
ncontrols = pheno.pheno.value_counts()[0]
ncases = pheno.pheno.value_counts()[1]

infl = calculate_inflation(total_cnvtype_df.P_VAL, normalize=True, ncases=ncases, ncontrols=ncontrols)
infl


RUNNING: lambda_calculation



{'pass': True,
 'step': 'lambda_calculation',
 'metrics': {'inflation': 21.636334545354355}}

In [41]:
pheno.pheno.value_counts()[1]

8066

1    8066
0    4370
Name: pheno, dtype: int64

In [40]:
fam = pd.read_csv(f'/data/GP2/quality_control/release4/genotype_qc/GP2_round4_JAN_24_2023_{label}.fam', sep='\t', header=None, names=['fid','iid','pat','mat','sex','pheno'], engine='python')
fam.pheno.value_counts()

 2    8711
 1    4876
-9    1475
Name: pheno, dtype: int64

In [13]:
# output final release samples
cnv_out_path = '/data/CARD/PD/GP2/genotypes/GP2/round3/GP2_cnvs'
release3_samples = '/data/CARD/PD/GP2/genotypes/GP2/round3/clean/release3.samples'
release_key_final = pd.read_csv('/data/CARD/PD/GP2/genotypes/GP2/round3/clean/release3_master_key.csv')
release_key_out = release_key_final.loc[release_key_final.pheno!=-9]
release_key_out['GP2sampleID'].to_csv(release3_samples, index=False, header=False)

In [20]:
with open(f'{swarm_scripts_dir}/cnv_dosage_release_split.swarm', 'w') as f:
    for label in ancestry_labels:
        sample_file = f'/data/CARD/PD/GP2/genotypes/GP2/round3/clean/{label}_release3.samples'
        label_samples = release_key_final.loc[release_key_final['label']==label]
        label_samples['GP2sampleID'].to_csv(sample_file, header=False, index=False)

        for cnv_type in cnv_types:
            for chrom in chroms:
                
                dosagefile = f'{cnv_path}/CNV_{label}_chr{chrom}_{cnv_type}_gp2ids.csv'
                dosage_out = f'{cnv_out_path}/CNV_{label}_chr{chrom}_{cnv_type}_release3.csv'
                
                cmd = f'\
python split_release_samples.py \
--cnv_dosage_file {dosagefile} \
--samples {sample_file} \
--out_path {dosage_out}'
            
                f.write(f'{cmd}\n')
f.close()


In [21]:
!swarm -f {swarm_scripts_dir}/cnv_dosage_release_split.swarm -g 8 -t 8 --time=00:30:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

53715605


In [14]:
# CHECK COUNTS
total_dosages = 0
for label in ancestry_labels:
    for cnv_type in cnv_types:
        dosage_out = f'{cnv_out_path}/CNV_{label}_chr1_{cnv_type}_release3.csv'
        print(label, cnv_type)
        label_count = !cat {dosage_out} | wc -l
        label_count = int(label_count[0])
        total_dosages += label_count
        print(label_count)
        print()

AMR PERCENT_BAF_INSERTION
412

AMR PERCENT_L2R_DELETION
412

AMR PERCENT_L2R_DUPLICATION
412

EUR PERCENT_BAF_INSERTION
10727

EUR PERCENT_L2R_DELETION
10727

EUR PERCENT_L2R_DUPLICATION
10727

SAS PERCENT_BAF_INSERTION
59

SAS PERCENT_L2R_DELETION
59

SAS PERCENT_L2R_DUPLICATION
59

MDE PERCENT_BAF_INSERTION
62

MDE PERCENT_L2R_DELETION
62

MDE PERCENT_L2R_DUPLICATION
62

EAS PERCENT_BAF_INSERTION
137

EAS PERCENT_L2R_DELETION
137

EAS PERCENT_L2R_DUPLICATION
137

AAC PERCENT_BAF_INSERTION
1215

AAC PERCENT_L2R_DELETION
1215

AAC PERCENT_L2R_DUPLICATION
1215

AFR PERCENT_BAF_INSERTION
404

AFR PERCENT_L2R_DELETION
404

AFR PERCENT_L2R_DUPLICATION
404

CAS PERCENT_BAF_INSERTION
303

CAS PERCENT_L2R_DELETION
303

CAS PERCENT_L2R_DUPLICATION
303

AJ PERCENT_BAF_INSERTION
923

AJ PERCENT_L2R_DELETION
923

AJ PERCENT_L2R_DUPLICATION
923



In [22]:
# push to release bucket
with open(f'{swarm_scripts_dir}/gcp_push_cnvs.swarm','w') as f:
    
    for label in ancestry_labels:
        for chrom in chroms:
    
#         !module load google-cloud-sdk/397.0.0; gsutil cp {cnv_out_path}/CNV_{label}_BAF_INSERTION_release3.csv gs://gp2tier2/release3_31102022/cnvs/{label}/
            cp1 = f'gsutil cp {cnv_out_path}/CNV_{label}_chr{chrom}_PERCENT_BAF_INSERTION_release3.csv gs://gp2tier2/release3_31102022/cnvs/{label}/'
            cp2 = f'gsutil cp {cnv_out_path}/CNV_{label}_chr{chrom}_PERCENT_L2R_DELETION_release3.csv gs://gp2tier2/release3_31102022/cnvs/{label}/'
            cp3 = f'gsutil cp {cnv_out_path}/CNV_{label}_chr{chrom}_PERCENT_L2R_DUPLICATION_release3.csv gs://gp2tier2/release3_31102022/cnvs/{label}/'

            cps = [cp1, cp2, cp3]

            for cp in cps:
                f.write(f'{cp}\n')
f.close()

In [23]:
!swarm -f {swarm_scripts_dir}/gcp_push_cnvs.swarm -g 8 -t 8 --time=00:30:00 --logdir {swarm_scripts_dir}/logs --module google-cloud-sdk/397.0.0 --gres=lscratch:20 --partition=norm

53719849


In [24]:
!module load google-cloud-sdk/397.0.0; gsutil cp /data/CARD/PD/GP2/genotypes/GP2/round3/GP2_cnvs/CNV_AMR_chr1_BAF_INSERTION_release3.csv gs://gp2tier2/release3_31102022/cnvs/AMR/

594


In [24]:
!ls -lh {cnv_out_path}/CNV_{label}_chr{chrom}_PERCENT_BAF_INSERTION_release3.csv

-rw-rw----+ 1 vitaled2 CARD 4.2K Dec  5 19:27 /data/CARD/PD/GP2/genotypes/GP2/round3/GP2_cnvs/CNV_AJ_chr22_PERCENT_BAF_INSERTION_release3.csv


In [15]:
!cat /data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_chr1_PERCENT_BAF_INSERTION_gp2ids.csv | wc -l

11696
