# GP2 CNVs Pipeline

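Steps:
1. Call SNP metrics from IDATs and write them to parquet files partitioned on Sample_ID.
2. Run QC on the SNPs from the corresponding genotypes.
3. Call CNVs from the SNP metrics, using only the SNPs that passed QC in step 2.
4. Aggregate the per-sample CNV calls into per-ancestry dosage files, map them to GP2 sample IDs, and run the CNV-WAS.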

In [1]:
import pandas as pd
import os
import shutil
import numpy as np
import glob
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm

from QC.utils import shell_do

Plink is found
Plink2 is found


In [2]:
############ SET THIS EVERY RELEASE ############
release = '5'

key_file = '/data/GP2/clinical/master_key/GP2_master_key_FINAL_release5_n26728.txt'
key = pd.read_csv(key_file)
# IID is the array identifier: <SentrixBarcode_A>_<SentrixPosition_A>
key.loc[:,'IID'] = key.loc[:,'SentrixBarcode_A'].astype(str) + '_' + key.loc[:,'SentrixPosition_A'].astype(str)
key.loc[:,'FID'] = "0"

swarm_scripts_dir = '/data/GP2/users/vitaled2/swarm'

ilmn_files_path = '/data/GP2/utils/ilmn_files'
# A1 is hg19, A2 is hg38. csv needed for indel calls
bpm_csv = f'{ilmn_files_path}/NeuroBooster_20042459_A2.csv'
bpm = f'{ilmn_files_path}/NeuroBooster_20042459_A2.bpm'
egt = f'{ilmn_files_path}/recluster_09272022.egt'
iaap = f'{ilmn_files_path}/iaap-cli/iaap-cli'

ref_fasta = '/data/vitaled2/ref/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna'

samples_list = key.loc[:,'filename'].unique()
barcodes_list = list(set([x.split('_')[0] for x in samples_list]))

# every raw-genotype location is derived from a single root
raw_geno_path = '/data/GP2/raw_genotypes'
snp_metrics_path = f'{raw_geno_path}/snp_metrics'
idat_path = f'{raw_geno_path}/idats'
plink_file_path = f'{raw_geno_path}/ped_bed'

cnv_path = f'/data/GP2/raw_genotypes/cnvs/release{release}'
covar_path = f'{cnv_path}/key/release{release}_covar.csv'
# using just autosomes for CNV analysis due to call quality
chroms = [str(i) for i in range(1,23)]

# ancestry label = last "_"-separated token of each QC'd .bed prefix (the *_maf_hwe files are skipped)
ancestry_labels = [x.split('/')[-1].replace('.bed','').split('_')[-1] for x in glob.glob(f'/data/GP2/quality_control/release{release}/genotype_qc/GP2_round5_APRIL_2023_*.bed') if '_maf_hwe' not in x]

# the key/ directory has to exist before the sample and covariate files can be written
os.makedirs(f'{cnv_path}/key', exist_ok=True)
key[['FID','GP2sampleID']].to_csv(f'{cnv_path}/key/release{release}.samples', sep='\t', header=False, index=False)
key[['GP2sampleID','IID']].to_csv(f'{cnv_path}/key/release{release}_sample_id_key.csv')
release_covars = key.loc[:,['FID', 'GP2sampleID','sex_for_qc', 'age', 'age_of_onset']]
release_covars.to_csv(covar_path)

cnv_types = ['PERCENT_BAF_INSERTION','PERCENT_L2R_DELETION','PERCENT_L2R_DUPLICATION']

In [5]:
# get snp metrics: write one pipeline command per array barcode to a swarm file
metrics_exist_list = []
bcftools_plugins_path = 'bin'
with open(f'{swarm_scripts_dir}/snp_metrics.swarm', 'w') as f:
    # barcodes_list is built from a set; sort it so the swarm file is reproducible
    for code in sorted(barcodes_list):
        cmd = ' '.join([
            'python run_snp_metrics_pipeline.py',
            f'--idat_path {idat_path}/{code}',
            f'--bpm {bpm}',
            f'--bpm_csv {bpm_csv}',
            f'--egt {egt}',
            f'--ref_fasta {ref_fasta}',
            f'--out_path {snp_metrics_path}',
            f'--iaap {iaap}',
            f'--bcftools_plugins_path {bcftools_plugins_path}',
        ])
        f.write(f'{cmd}\n')
# the with-block closes the file; no explicit close is needed


In [7]:
# Run swarm on the snp_metrics command file, logging to {swarm_scripts_dir}/logs.
# NOTE(review): presumably -g is memory (GB) and -t threads per command — confirm against the cluster's swarm docs.
!swarm -f {swarm_scripts_dir}/snp_metrics.swarm -g 64 -t 16 --time=04:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

64432484


In [5]:
!ls /data/GP2/raw_genotypes/snp_metrics/206046190071/snp_metrics_206046190071.parquet

Sample_ID=206046190071_R01C01  Sample_ID=206046190071_R02C01


In [6]:
# Sort every sample into exist_dir / missing_dir depending on whether its
# Sample_ID partition directory is present inside the barcode-level parquet.
exist_dir, missing_dir = [], []

for sample in samples_list:
    barcode = sample.partition('_')[0]
    partition_dir = f'/data/GP2/raw_genotypes/snp_metrics/{barcode}/snp_metrics_{barcode}.parquet/Sample_ID={sample}'
    bucket = exist_dir if os.path.isdir(partition_dir) else missing_dir
    bucket.append(sample)

print(f'metrics exist: {len(exist_dir)}')
print(f'missing metrics: {len(missing_dir)}')

metrics exist: 26578
missing metrics: 150


In [7]:
# now check if we have idats for missing snp metrics
missing_idats = []
exists_idats = []
# missing_dir is the list of samples without a metrics partition, built by the
# existence check; it replaces `missing_metrics`, which is only defined by a later cell
for sample in missing_dir:
    code = sample.split('_')[0]
    grn = f'{idat_path}/{code}/{sample}_Grn.idat'
    red = f'{idat_path}/{code}/{sample}_Red.idat'
    # both colour channels are required
    if os.path.isfile(grn) and os.path.isfile(red):
        exists_idats.append(sample)
    else:
        missing_idats.append(sample)

In [8]:
print(len(missing_idats))
print(len(exists_idats))

93
57


In [13]:
with open(f'{swarm_scripts_dir}/cnv_geno_qc.swarm', 'w') as f:

    for label in ancestry_labels:
        geno_name = f'GP2_round5_APRIL_2023_{label}'
        out_path = f'{cnv_path}/genotypes/{geno_name}_clean'
        geno_path = f'/data/GP2/quality_control/release{release}/genotype_qc/{geno_name}'
    
        cmd = f'\
python run_cnv_qc_pipeline.py \
--geno_path {geno_path} \
--out_path {out_path} \
--covar_path {covar_path} \
--maf 0.01 \
--geno 0.2 \
--hwe 5e-6 \
--indep_pairwise 1000 10 0.01 \
--samples_path {cnv_path}/key/release{release}.samples'
        
        f.write(f'{cmd}\n')
f.close()
!cat {swarm_scripts_dir}/cnv_geno_qc.swarm


python run_cnv_qc_pipeline.py --geno_path /data/GP2/quality_control/release5/genotype_qc/GP2_round5_APRIL_2023_FIN --out_path /data/GP2/raw_genotypes/cnvs/release5/genotypes/GP2_round5_APRIL_2023_FIN_clean --covar_path /data/GP2/raw_genotypes/cnvs/release5/key/release5_covar.csv --maf 0.01 --geno 0.2 --hwe 5e-6 --indep_pairwise 1000 10 0.01 --samples_path /data/GP2/raw_genotypes/cnvs/release5/key/release5.samples
python run_cnv_qc_pipeline.py --geno_path /data/GP2/quality_control/release5/genotype_qc/GP2_round5_APRIL_2023_EUR --out_path /data/GP2/raw_genotypes/cnvs/release5/genotypes/GP2_round5_APRIL_2023_EUR_clean --covar_path /data/GP2/raw_genotypes/cnvs/release5/key/release5_covar.csv --maf 0.01 --geno 0.2 --hwe 5e-6 --indep_pairwise 1000 10 0.01 --samples_path /data/GP2/raw_genotypes/cnvs/release5/key/release5.samples
python run_cnv_qc_pipeline.py --geno_path /data/GP2/quality_control/release5/genotype_qc/GP2_round5_APRIL_2023_EAS --out_path /data/GP2/raw_genotypes/cnvs/release5/ge

In [28]:
# Run swarm on the genotype-QC command file with the python/3.8 and plink/1.9 modules loaded.
!swarm -f {swarm_scripts_dir}/cnv_geno_qc.swarm -g 16 -t 16 --time=02:00:00 --logdir {swarm_scripts_dir}/logs --module python/3.8,plink/1.9 --gres=lscratch:20 --partition=norm


63958236


In [9]:
# check outputs from qc pipeline

for label in ancestry_labels:
    extensions = ['fam','cov','eigenvec','bim', 'snps']
    out_geno = f'/data/GP2/raw_genotypes/cnvs/release5/genotypes/GP2_round5_APRIL_2023_{label}_clean.bed'
    print(f'{out_geno}')
    for ext in extensions:
        out_geno_path = f'/data/GP2/raw_genotypes/cnvs/release5/genotypes/GP2_round5_APRIL_2023_{label}_clean.{ext}'
        if os.path.isfile(out_geno):
            count = !cat {out_geno_path} | wc -l
            print(f'{ext}: {count}') 
            
        else:
            print(f'{out_geno} does not exist!!!')
    print()
    print()


/data/GP2/raw_genotypes/cnvs/release5/genotypes/GP2_round5_APRIL_2023_FIN_clean.bed
fam: ['16']
cov: ['16']
eigenvec: ['16']
bim: ['1152']
snps: ['1152']


/data/GP2/raw_genotypes/cnvs/release5/genotypes/GP2_round5_APRIL_2023_EUR_clean.bed
fam: ['15356']
cov: ['15356']
eigenvec: ['15356']
bim: ['14573']
snps: ['14573']


/data/GP2/raw_genotypes/cnvs/release5/genotypes/GP2_round5_APRIL_2023_EAS_clean.bed
fam: ['2913']
cov: ['2913']
eigenvec: ['2913']
bim: ['17255']
snps: ['17255']


/data/GP2/raw_genotypes/cnvs/release5/genotypes/GP2_round5_APRIL_2023_AAC_clean.bed
fam: ['1331']
cov: ['1331']
eigenvec: ['1331']
bim: ['13110']
snps: ['13110']


/data/GP2/raw_genotypes/cnvs/release5/genotypes/GP2_round5_APRIL_2023_AJ_clean.bed
fam: ['1546']
cov: ['1546']
eigenvec: ['1546']
bim: ['4747']
snps: ['4747']


/data/GP2/raw_genotypes/cnvs/release5/genotypes/GP2_round5_APRIL_2023_MDE_clean.bed
fam: ['149']
cov: ['149']
eigenvec: ['149']
bim: ['2672']
snps: ['2672']


/data/GP2/raw_genotypes/cnvs/

In [19]:

total_sample_n = 0
for label in ancestry_labels:
    out_name = f'GP2_round5_APRIL_2023_{label}_clean'
    out_path = f'{cnv_path}/genotypes/{out_name}'
    fam = pd.read_csv(f'{out_path}.fam', sep='\s+', header=None, names=['fid','iid','pat','mat','sex','pheno'])
    pheno_counts = fam.pheno.value_counts()
    ncases = pheno_counts[2]
    ncontrols = pheno_counts[1]
    ntotal = ncases + ncontrols
    total_sample_n += ntotal
    nsnps = !cat {out_path}.bim | wc -l
    print(f'{label} | total_n: {ntotal} | n_cases: {ncases} | n_controls: {ncontrols} | n_snps: {int(nsnps[0])}')
    print()

FIN | total_n: 13 | n_cases: 10 | n_controls: 3 | n_snps: 1152

EUR | total_n: 14196 | n_cases: 9230 | n_controls: 4966 | n_snps: 14573

EAS | total_n: 2900 | n_cases: 878 | n_controls: 2022 | n_snps: 17255

AAC | total_n: 1320 | n_cases: 289 | n_controls: 1031 | n_snps: 13110

AJ | total_n: 1126 | n_cases: 740 | n_controls: 386 | n_snps: 4747

MDE | total_n: 124 | n_cases: 106 | n_controls: 18 | n_snps: 2672

SAS | total_n: 215 | n_cases: 80 | n_controls: 135 | n_snps: 3273

CAS | total_n: 474 | n_cases: 195 | n_controls: 279 | n_snps: 5367

AMR | total_n: 550 | n_cases: 345 | n_controls: 205 | n_snps: 5020

AFR | total_n: 2343 | n_cases: 855 | n_controls: 1488 | n_snps: 18843



In [18]:
# Write one CNV-calling command per sample (grouped by ancestry) to a swarm file.
# Samples whose SNP-metrics partition is absent are collected in missing_metrics.
missing_metrics = []
intervals = '/data/GP2/utils/ref_dir/glist_hg38_intervals.csv'

with open(f'{swarm_scripts_dir}/cnvs.swarm', 'w') as f:
    for label in ancestry_labels:
    # for label in ['EUR']:
        out_name = f'GP2_round5_APRIL_2023_{label}_clean'
        geno_path = f'{cnv_path}/genotypes/{out_name}'
        bim_path = f'{geno_path}.bim'
        fam = pd.read_csv(f'{geno_path}.fam', sep=r'\s+', header=None, names=['FID','IID','pat','mat','sex','pheno'])
        # fam IIDs are GP2 sample IDs; the key supplies the matching array filename
        fam_key = fam.merge(key, left_on='IID', right_on='GP2sampleID', how='left')

        # a left merge leaves filename as NaN for fam samples absent from the key
        n_unmatched = fam_key.filename.isna().sum()
        if n_unmatched:
            print(f'{label}: {n_unmatched} fam samples not found in key; skipped')

        label_dir = f'{cnv_path}/cnv_calls/{label}'
        os.makedirs(label_dir, exist_ok=True)
        for sample in fam_key.filename.dropna().unique():
            code = sample.split('_')[0]

            mfile = f'{snp_metrics_path}/{code}/snp_metrics_{code}.parquet/Sample_ID={sample}'

            if os.path.isdir(mfile):
                cnv_out = f'{label_dir}/CNV_{label}_{sample}.parquet'

                cmd = f'\
python run_cnv_pipeline.py \
--metrics {mfile} \
--bim {bim_path} \
--out_path {cnv_out} \
--intervals {intervals} \
--min_variants 10 \
--kb_window 250 \
--min_gentrain 0.2'

                f.write(f'{cmd}\n')

            else:
                missing_metrics.append(sample)
# the with-block closes the file; no explicit close is needed

In [27]:
# Run swarm on the per-sample CNV-calling command file with the python/3.8 module loaded.
!swarm -f {swarm_scripts_dir}/cnvs.swarm -g 16 -t 8 --time=00:15:00 --logdir {swarm_scripts_dir}/logs --module python/3.8 --gres=lscratch:20 --partition=norm

64688128


In [7]:
# Spot-check one sample's CNV calls: intervals supported by more than 10 variants.
# The read must be live, otherwise test_cnv is undefined on a fresh kernel.
test_cnv = pd.read_parquet(f'{cnv_path}/cnv_calls/EUR/CNV_EUR_204620380001_R01C01.parquet')
test_cnv[test_cnv.NUM_VARIANTS>10]

Unnamed: 0,INTERVAL,NUM_VARIANTS,PERCENT_BAF_INSERTION,PERCENT_L2R_DELETION,PERCENT_L2R_DUPLICATION,START_PLUS_WINDOW,START,STOP,STOP_PLUS_WINDOW
85,ABCG1,13,0.000000,0.000000,0.000000,41949688,42199688,42297244,42547244
121,ABR,12,0.000000,0.000000,0.166667,753517,1003517,1187322,1437322
190,ACPT,16,0.000000,0.000000,0.000000,50540414,50790414,50795224,51045224
237,ACTR3B,13,0.076923,0.000000,0.076923,152509748,152759748,152855379,153105379
291,ADAMTS2,13,0.000000,0.000000,0.076923,178860850,179110850,179345430,179595430
...,...,...,...,...,...,...,...,...,...
25314,ZNF556,13,0.000000,0.076923,0.153846,2617334,2867334,2878503,3128503
25476,ZNF761,17,0.000000,0.000000,0.117647,53181973,53431973,53458261,53708261
25479,ZNF765,16,0.000000,0.000000,0.125000,53145143,53395143,53412009,53662009
25515,ZNF813,17,0.000000,0.000000,0.117647,53217734,53467734,53494292,53744292


In [13]:
samples_list

array(['206966380021_R01C01', '206966380021_R02C01',
       '206966380021_R03C01', ..., '206412050061_R06C01',
       '206412050061_R07C01', '206412050061_R08C01'], dtype=object)

In [18]:
!ls {cnv_path}/cnv_calls/EUR/CNV_EUR_204620380001_R01C01.parquet

/data/GP2/raw_genotypes/cnvs/release5/cnv_calls/EUR/CNV_EUR_204620380001_R01C01.parquet


In [21]:
# List the per-sample CNV call files for the selected ancestry label(s).
# for label in ancestry_labels:
for label in ['EUR']:
    # use the loop variable; no trailing slash, so globbed paths do not contain '//'
    cnv_dir_ = f'{cnv_path}/cnv_calls/{label}'
    label_cnvs_list = glob.glob(f'{cnv_dir_}/*.parquet')
    

In [23]:
len(label_cnvs_list)

15356

In [24]:
cnv_types

['PERCENT_BAF_INSERTION', 'PERCENT_L2R_DELETION', 'PERCENT_L2R_DUPLICATION']

In [25]:
# Write dosage commands: one per ancestry label per CNV type.
# For each label, the list of its CNV call files is first saved to a headerless CSV
# that the dosage script receives via --files.

with open(f'{swarm_scripts_dir}/cnv_dosages.swarm', 'w') as f:
    for label in ancestry_labels:
        label_dir = f'{cnv_path}/dosages/{label}'
        os.makedirs(label_dir, exist_ok=True)

        files_csv = f'{label_dir}/CNV_{label}_files.csv'
        call_files = glob.glob(f'{cnv_path}/cnv_calls/{label}/CNV_{label}_*.parquet')
        pd.DataFrame({'filename': call_files}).to_csv(files_csv, header=False, index=False)

        for cnv_type in cnv_types:
            cmd = ' '.join([
                'python /data/vitaled2/CNVs/run_cnv_dosage_pipeline.py',
                f'--files {files_csv}',
                f'--label {label}',
                f'--cnv_type {cnv_type}',
                f'--out_path {label_dir}/CNV_{label}_{cnv_type}.csv',
            ])
            f.write(cmd + '\n')


In [29]:
# Run swarm on the dosage command file with the python/3.8 module loaded (no --time is passed here).
!swarm -f {swarm_scripts_dir}/cnv_dosages.swarm -g 8 -t 4 --logdir {swarm_scripts_dir}/logs --module python/3.8 --gres=lscratch:20 --partition=norm

64981537


In [48]:
!head /data/GP2/clinical/master_key/GP2_master_key_FINAL_release5_n26728.txt

GP2ID,GP2sampleID,manifest_id,phenotype,pheno_for_qc,other_pheno,sex_for_qc,age,age_of_onset,age_at_diagnosis,age_at_death,race_for_qc,family_history,region_for_qc,study,pruned,pruned_reason,label,related,SentrixBarcode_A,SentrixPosition_A,filename
IMMUNEPD_000001,IMMUNEPD_000001_s1,m1,PD,2,PD,1,61.0,,61.0,,White,Yes,USA,IMMUNEPD,0,,EUR,,206966380021,R01C01,206966380021_R01C01
IMMUNEPD_000002,IMMUNEPD_000002_s1,m1,PD,2,PD,1,66.0,,66.0,,White,Yes,USA,IMMUNEPD,0,,EUR,,206966380021,R02C01,206966380021_R02C01
IMMUNEPD_000003,IMMUNEPD_000003_s1,m1,Control,1,Control,2,55.0,,,,White,No,USA,IMMUNEPD,0,,EUR,,206966380021,R03C01,206966380021_R03C01
IMMUNEPD_000004,IMMUNEPD_000004_s1,m1,Control,1,Control,1,50.0,,,,White,No,USA,IMMUNEPD,0,,EUR,,206966380021,R04C01,206966380021_R04C01
IMMUNEPD_000005,IMMUNEPD_000005_s1,m1,Control,1,Control,2,74.0,,,,White,No,USA,IMMUNEPD,0,,EUR,,206966380021,R05C01,206966380021_R05C01
IMMUNEPD_000006,IMMUNEPD_000006_s1,m1,PD,2,PD,1,71.0,,69.0,,White,No,USA,IMMUNEPD

In [58]:
# Write one ID-update command per ancestry label per CNV type.
# Each command reads a dosage file and the master key, and is given a release
# output path (--out_path) and a pheno output path (--pheno_out).
with open(f'{swarm_scripts_dir}/update_cnv_ids.swarm','w') as f:
    for label in ancestry_labels:
        label_dir = f'{cnv_path}/dosages/{label}'
        out_dir = f'/data/GP2/releases/gp2tier2/release5_11052023/cnvs/{label}'
        os.makedirs(out_dir, exist_ok=True)
        for cnv_type in cnv_types:

            dosagefile = f'{label_dir}/CNV_{label}_{cnv_type}.csv'
            dosagefile_out = f'{out_dir}/CNV_{label}_{cnv_type}_gp2ids.csv'
            pheno_out = f'{label_dir}/GP2_{label}_{cnv_type}.pheno'
            # key_file is the same master key loaded in the config cell
            cmd = f'\
python update_cnv_ids.py \
--dosagefile {dosagefile} \
--key {key_file} \
--pheno_out {pheno_out} \
--out_path {dosagefile_out}'
            f.write(f'{cmd}\n')
# the with-block closes the file; no explicit close is needed

In [59]:
# Run swarm on the ID-update command file with the python/3.8 module loaded.
!swarm -f {swarm_scripts_dir}/update_cnv_ids.swarm -g 8 -t 8 --logdir {swarm_scripts_dir}/logs --module python/3.8 --gres=lscratch:20 --partition=norm

65062074


In [93]:
# !cat {swarm_scripts_dir}/logs/swarm_65062074_0.o
# dosagefile
# !head /data/GP2/raw_genotypes/cnvs/release5/dosages/FIN/CNV_FIN_PERCENT_BAF_INSERTION.csv
# !head /data/GP2/releases/gp2tier2/release5_11052023/cnvs/EUR/CNV_EUR_PERCENT_BAF_INSERTION_gp2ids.csv
# !ls /data/GP2/releases/gp2tier2/release5_11052023/cnvs/EUR
# !head /data/GP2/raw_genotypes/cnvs/release5/dosages/EUR/GP2_EUR_PERCENT_BAF_INSERTION.pheno
!head {cnv_path}/key/release5_covar.csv

,FID,GP2sampleID,sex_for_qc,age,age_of_onset
0,0,IMMUNEPD_000001_s1,1,61.0,
1,0,IMMUNEPD_000002_s1,1,66.0,
2,0,IMMUNEPD_000003_s1,2,55.0,
3,0,IMMUNEPD_000004_s1,1,50.0,
4,0,IMMUNEPD_000005_s1,2,74.0,
5,0,IMMUNEPD_000006_s1,1,71.0,
6,0,IMMUNEPD_000007_s1,2,63.0,
7,0,IMMUNEPD_000008_s1,1,60.0,
8,0,IMMUNEPD_000009_s1,1,74.0,


In [82]:
# Write one CNV-WAS command per ancestry label per CNV type.
cnv_out_path = '/data/GP2/raw_genotypes/cnvwas'
# The pipeline script is not shown here; create its output directory up front in case it does not.
os.makedirs(cnv_out_path, exist_ok=True)

with open(f'{swarm_scripts_dir}/cnvwas.swarm', 'w') as f:
    for label in ancestry_labels:
        for cnv_type in cnv_types:

            dosagefile = f'/data/GP2/releases/gp2tier2/release5_11052023/cnvs/{label}/CNV_{label}_{cnv_type}_gp2ids.csv'
            pheno = f'{cnv_path}/dosages/{label}/GP2_{label}_{cnv_type}.pheno'
            # covar_path is the covariate file written in the config cell
            covar = covar_path
            out_path = f'{cnv_out_path}/GP2_{label}_{cnv_type}_cnvwas.tab'

            cmd = f'\
python run_cnvwas_pipeline.py \
--cnv_dosage_file {dosagefile} \
--out_path {out_path} \
--pheno {pheno} \
--covar {covar}'

            f.write(f'{cmd}\n')
# the with-block closes the file; no explicit close is needed


In [99]:
# Run swarm on the CNV-WAS command file.
# NOTE(review): unlike the other Python swarms in this notebook, no --module is passed here — confirm that is intended.
!swarm -f {swarm_scripts_dir}/cnvwas.swarm -g 32 -t 16 --time=00:30:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

65078390


In [104]:
!cat {swarm_scripts_dir}/logs/swarm_65078390_29.e




Traceback (most recent call last):
  File "run_cnvwas_pipeline.py", line 17, in <module>
    CNV_WAS(cnv_dosage_file, pheno, covar, out_path)
  File "/gpfs/gsfs12/users/vitaled2/CNVs/CNV/cnv.py", line 503, in CNV_WAS
    fitted = sm.formula.glm(formula=formula, family=sm.families.Binomial(), data=data_df_final).fit()
  File "/usr/local/Anaconda/envs/py3.8/lib/python3.8/site-packages/statsmodels/genmod/generalized_linear_model.py", line 1075, in fit
    return self._fit_irls(start_params=start_params, maxiter=maxiter,
  File "/usr/local/Anaconda/envs/py3.8/lib/python3.8/site-packages/statsmodels/genmod/generalized_linear_model.py", line 1224, in _fit_irls
    raise PerfectSeparationError(msg)
statsmodels.tools.sm_exceptions.PerfectSeparationError: Perfect separation detected, results not available


In [86]:
from GWAS.gwas import calculate_inflation

# Genomic inflation of the CNV-WAS results for one ancestry, per CNV type.
label = 'EUR'
cnv_lambda_dict = dict()
for cnv_type in cnv_types:
    # pheno file for this label / CNV type, at the path the update_cnv_ids commands write to
    pheno_df = pd.read_csv(f'{cnv_path}/dosages/{label}/GP2_{label}_{cnv_type}.pheno', sep='\t', engine='python')
    pheno_counts = pheno_df.pheno.value_counts()
    # NOTE(review): assumes pheno is coded 0 = control, 1 = case — confirm against update_cnv_ids.py
    ncontrols = pheno_counts[0]
    ncases = pheno_counts[1]

    # there is a single results table per label / CNV type (not one per chromosome), so read it once
    tab = f'{cnv_out_path}/GP2_{label}_{cnv_type}_cnvwas.tab'
    if not os.path.isfile(tab):
        print(f'{tab} does not exist!')
        continue

    total_cnvtype_df = pd.read_csv(tab, sep=r'\s+', engine='python')
    infl = calculate_inflation(total_cnvtype_df.P_VAL, normalize=True, ncases=ncases, ncontrols=ncontrols)
    cnv_lambda_dict[cnv_type] = infl
    




RUNNING: lambda_calculation


RUNNING: lambda_calculation


RUNNING: lambda_calculation



In [91]:
cnv_lambda_dict

{'PERCENT_BAF_INSERTION': {'pass': True,
  'step': 'lambda_calculation',
  'metrics': {'inflation': 2.101750589371727}},
 'PERCENT_L2R_DELETION': {'pass': True,
  'step': 'lambda_calculation',
  'metrics': {'inflation': 14.555569305672298}},
 'PERCENT_L2R_DUPLICATION': {'pass': True,
  'step': 'lambda_calculation',
  'metrics': {'inflation': 1.3423052908502526}}}

In [82]:

# Ad-hoc re-check of the inflation value for a single CNV type.
# NOTE(review): this cell depends on kernel state left behind by the inflation loop:
# `cnv_type`, `total_cnvtype_df` and the imported `calculate_inflation`.
# NOTE(review): it reads a chr1-specific pheno file; no cell in the visible notebook writes one — confirm it still exists.
# It also rebinds `pheno`, which the cnvwas swarm cell uses as a path string.
pheno = pd.read_csv(f'{cnv_path}/GP2_EUR_chr1_{cnv_type}.pheno', sep='\t', engine='python')
ncontrols = pheno.pheno.value_counts()[0]
ncases = pheno.pheno.value_counts()[1]

infl = calculate_inflation(total_cnvtype_df.P_VAL, normalize=True, ncases=ncases, ncontrols=ncontrols)
infl


RUNNING: lambda_calculation



{'pass': True,
 'step': 'lambda_calculation',
 'metrics': {'inflation': 1.3423052908502526}}

In [33]:
key

Unnamed: 0,GP2ID,GP2sampleID,manifest_id,phenotype,pheno_for_qc,other_pheno,sex_for_qc,age,age_of_onset,age_at_diagnosis,...,study,pruned,pruned_reason,label,related,SentrixBarcode_A,SentrixPosition_A,filename,IID,FID
0,IMMUNEPD_000001,IMMUNEPD_000001_s1,m1,PD,2,PD,1,61.0,,61.0,...,IMMUNEPD,0,,EUR,,206966380021,R01C01,206966380021_R01C01,206966380021_R01C01,0
1,IMMUNEPD_000002,IMMUNEPD_000002_s1,m1,PD,2,PD,1,66.0,,66.0,...,IMMUNEPD,0,,EUR,,206966380021,R02C01,206966380021_R02C01,206966380021_R02C01,0
2,IMMUNEPD_000003,IMMUNEPD_000003_s1,m1,Control,1,Control,2,55.0,,,...,IMMUNEPD,0,,EUR,,206966380021,R03C01,206966380021_R03C01,206966380021_R03C01,0
3,IMMUNEPD_000004,IMMUNEPD_000004_s1,m1,Control,1,Control,1,50.0,,,...,IMMUNEPD,0,,EUR,,206966380021,R04C01,206966380021_R04C01,206966380021_R04C01,0
4,IMMUNEPD_000005,IMMUNEPD_000005_s1,m1,Control,1,Control,2,74.0,,,...,IMMUNEPD,0,,EUR,,206966380021,R05C01,206966380021_R05C01,206966380021_R05C01,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26723,MDGAP-EBB_000149,MDGAP-EBB_000149_s1,m4,Control,1,Control,1,,,,...,MDGAP-EBB,1,callrate_prune,,,206412050061,R04C01,206412050061_R04C01,206412050061_R04C01,0
26724,MDGAP-EBB_000150,MDGAP-EBB_000150_s1,m4,Control,1,Control,2,,,,...,MDGAP-EBB,1,callrate_prune,,,206412050061,R05C01,206412050061_R05C01,206412050061_R05C01,0
26725,MDGAP-EBB_000151,MDGAP-EBB_000151_s1,m4,Control,1,Control,1,,,,...,MDGAP-EBB,1,callrate_prune,,,206412050061,R06C01,206412050061_R06C01,206412050061_R06C01,0
26726,MDGAP-EBB_000152,MDGAP-EBB_000152_s1,m4,Control,1,Control,2,,,,...,MDGAP-EBB,1,callrate_prune,,,206412050061,R07C01,206412050061_R07C01,206412050061_R07C01,0


## split out chromosome 4

In [5]:
import dask.dataframe as dd
import shutil
# Root directory that receives the per-ancestry chromosome=4 partitions copied out of the SNP-metrics parquets
project_dir = '/data/GP2/projects/2023_06_14_DV_snp_metrics_extract/chr_4_extract'
# !mkdir {project_dir}

# !ls {snp_metrics_path}/204620380001/snp_metrics_204620380001.parquet/Sample_ID=204620380001_R01C01

In [38]:
# Write one parquet-rewrite command per array barcode.
with open(f'{swarm_scripts_dir}/rewrite_parquets.swarm', 'w') as f:
    # key has one row per sample, so a barcode repeats for every position on the array;
    # de-duplicate so each barcode-level parquet is rewritten by exactly one command
    for code in key.SentrixBarcode_A.unique():

        sample_parquet = f'{snp_metrics_path}/{code}/snp_metrics_{code}.parquet'
        sample_parquet_out = f'{snp_metrics_path}/{code}/snp_metrics_{code}'

        if os.path.isdir(sample_parquet):
            cmd = f'python3 rewrite_parquets.py {sample_parquet} {sample_parquet_out}'

            f.write(f'{cmd}\n')
# the with-block closes the file; no explicit close is needed

In [51]:
# Run swarm on the parquet-rewrite command file.
!swarm -f {swarm_scripts_dir}/rewrite_parquets.swarm -g 8 -t 8 --time=00:15:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

2932446


In [95]:


# Copy each labelled sample's chromosome=4 partition into {project_dir}/{label}/.
# dropna() removes the missing label without relying on list.remove(np.nan),
# which raises ValueError when there is no NaN to remove.
labels = list(key.label.dropna().unique())
for label in labels:
    label_key = key.loc[key.label==label]
    for sample in label_key.filename:
        code = sample.split('_')[0]
        parquet = f'{snp_metrics_path}/{code}/snp_metrics_{code}/Sample_ID={sample}/chromosome=4'
        if os.path.isdir(parquet):
            # copytree creates missing parent directories; dirs_exist_ok makes the cell safe to re-run
            shutil.copytree(parquet, f'{project_dir}/{label}/Sample_ID={sample}/chromosome=4', dirs_exist_ok=True)
            

In [6]:
# Write one gsutil copy command per sample whose chromosome=4 extract exists locally.
with open(f'{swarm_scripts_dir}/cp_chr4_gcp.swarm', 'w') as f:
    # skip the missing (NaN) label: it never matches any row
    for label in key.label.dropna().unique():
        label_key = key.loc[key.label==label]
        for sample in label_key.filename:
            parquet = f'{project_dir}/{label}/Sample_ID={sample}/chromosome=4'
            if os.path.isdir(parquet):
                # cd first so the Sample_ID=... directory is passed to gsutil as a relative path
                cmd = f'cd {project_dir}/{label}; gsutil -m cp -r Sample_ID={sample}/ gs://snp_metrics_sample/{label}/'

                f.write(f'{cmd}\n')
# the with-block closes the file; no explicit close is needed

In [10]:
# Run swarm on the chr4 upload command file with the google-cloud-sdk module loaded (provides gsutil).
!swarm -f {swarm_scripts_dir}/cp_chr4_gcp.swarm -g 8 -t 8 --time=00:15:00 --logdir {swarm_scripts_dir}/logs --module google-cloud-sdk --gres=lscratch:20 --partition=norm

3119163


In [9]:
# !cat {swarm_scripts_dir}/cp_chr4_gcp.swarm | wc -l
!head {swarm_scripts_dir}/cp_chr4_gcp.swarm

cd /data/GP2/projects/2023_06_14_DV_snp_metrics_extract/chr_4_extract/EUR; gsutil -m cp -r Sample_ID=206966380021_R01C01/ gs://snp_metrics_sample/EUR/
cd /data/GP2/projects/2023_06_14_DV_snp_metrics_extract/chr_4_extract/EUR; gsutil -m cp -r Sample_ID=206966380021_R02C01/ gs://snp_metrics_sample/EUR/
cd /data/GP2/projects/2023_06_14_DV_snp_metrics_extract/chr_4_extract/EUR; gsutil -m cp -r Sample_ID=206966380021_R03C01/ gs://snp_metrics_sample/EUR/
cd /data/GP2/projects/2023_06_14_DV_snp_metrics_extract/chr_4_extract/EUR; gsutil -m cp -r Sample_ID=206966380021_R04C01/ gs://snp_metrics_sample/EUR/
cd /data/GP2/projects/2023_06_14_DV_snp_metrics_extract/chr_4_extract/EUR; gsutil -m cp -r Sample_ID=206966380021_R05C01/ gs://snp_metrics_sample/EUR/
cd /data/GP2/projects/2023_06_14_DV_snp_metrics_extract/chr_4_extract/EUR; gsutil -m cp -r Sample_ID=206966380021_R06C01/ gs://snp_metrics_sample/EUR/
cd /data/GP2/projects/2023_06_14_DV_snp_metrics_extract/chr_4_extract/EUR; gsutil -m cp -r Sam

In [84]:
!ls -lh /data/GP2/projects/2023_06_14_DV_snp_metrics_extract/chr_4_extract/EUR/Sample_ID=206978580059_R05C01/chromosome=4

total 6.9M
-rw-r----- 1 vitaled2 vitaled2 3.7M Jun 14 14:32 part.97.parquet
-rw-r----- 1 vitaled2 vitaled2 3.1M Jun 14 14:32 part.98.parquet


In [120]:
!module load google-cloud-sdk; gsutil cp {key_file} gs://snp_metrics_sample/

[-] Unloading python 3.8  ... 
[+] Loading python 3.7  ... 
[+] Loading google-cloud-sdk  397.0.0 

The following have been reloaded with a version change:
  1) python/3.8 => python/3.7

Copying file:///data/GP2/clinical/master_key/GP2_master_key_FINAL_release5_n26728.txt [Content-Type=text/plain]...
- [1 files][  3.5 MiB/  3.5 MiB]                                                
Operation completed over 1 objects/3.5 MiB.                                      
