In [1]:
import pandas as pd
import os
import shutil
import numpy as np
import glob
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm

In [13]:
idat_path = '/data/CARD/PD/GP2/raw_genotypes/GP2_idats'
swarm_scripts_dir = f'/data/CARD/PD/GP2/swarm_scripts'

ilmn_files_path = '/data/CARD/PD/GP2/ilmn_files'
# A1 is hg19, A2 is hg38. csv needed for indel calls
bpm_csv = f'{ilmn_files_path}/NeuroBooster_20042459_A2.csv' 
bpm = f'{ilmn_files_path}/NeuroBooster_20042459_A2.bpm'
egt = f'{ilmn_files_path}/recluster_09272022.egt'
iaap = f'{ilmn_files_path}/iaap-cli/iaap-cli'

ref_fasta = '/data/vitaled2/ref/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna'

chroms = [str(i) for i in range(1,23)] + ['X','Y','M']


key_file = '/data/CARD/PD/GP2/clinical/key_merge/GP2_merge/GP2_master_key_full.txt'
key = pd.read_csv(f'{key_file}', sep='\t')
key.loc[:,'filename'] = key.loc[:,'SentrixBarcode_A'].astype(str) + '_' + key.loc[:,'SentrixPosition_A'].astype(str)

cohorts = [
    'SYNAPS-KZ', 
    'CORIELL', 
    'IPDGCEA-JU', 
    'KOC', 
    'IPDGCAF-NG', 
    'IPDGCEA-MU', 
    'PAGE', 
    'NZP3', 
    'S4', 
    'BCM',
    'BLAACPD-RUSH', 
    'BIOFIND', 
    'UMD', 
    'BLAACPD-UAB', 
    'LCC', 
    'SYNAPS-GR', 
    'MDGAP-QSBB', 
    'APGS', 
    'MDGAP-KINGS', 
    'MDGAP-IBB', 
    'BLAACPD-UC', 
    'BLAACPD-KPM', 
    'PPMI', 
    'IPDGCEA-UKM'
]

release_key = key.loc[key['study'].isin(cohorts)]
release_key.to_csv('/data/CARD/PD/GP2/clinical/key_merge/GP2_merge/release3_key.csv', index=False, header=True)

samples_list = release_key.loc[:,'filename'].unique()
barcodes_list = list(set([x.split('_')[0] for x in samples_list]))

genes = pd.read_csv('/data/CARD/PD/GP2/ref_panel/glist-hg38', sep='\s+', header=None, names=['chr','start','end','symbol'], dtype={'chr':str,'start':int,'end':int})
genes.columns = ['CHR','START','STOP','NAME']
genes_out = genes.loc[genes.CHR.isin(chroms)]
genes_out[['NAME','CHR','START','STOP']].to_csv('/data/CARD/PD/GP2/ref_panel/glist_hg38_intervals.csv', index=False)
# gene_list originally from plink glist
gene_list = '/data/CARD/PD/GP2/ref_panel/glist_hg38_intervals.csv'

raw_geno_path = '/data/CARD/PD/GP2/raw_genotypes'
snp_metrics_path = f'{raw_geno_path}/GP2_snp_metrics'
idat_path = f'{raw_geno_path}/GP2_idats'
plink_file_path = f'{raw_geno_path}/GP2_plink'

  key = pd.read_csv(f'{key_file}', sep='\t')


In [3]:
# len(samples_list)
# sorted(cohorts)
len(release_key)

21895

In [None]:
# get snp metrics
bcftools_plugins_path = 'bin'
with open(f'{swarm_scripts_dir}/snp_metrics.swarm', 'w') as f:
    for code in barcodes_list:
        idat_path_ = f'{idat_path}/{code}'
        metrics_out = f'{snp_metrics_path}'

        cmd = f'\
python run_snp_metrics_pipeline.py \
--idat_path {idat_path_} \
--bpm {bpm} \
--bpm_csv {bpm_csv} \
--egt {egt} \
--ref_fasta {ref_fasta} \
--out_path {metrics_out} \
--iaap {iaap} \
--bcftools_plugins_path {bcftools_plugins_path}'
        f.write(f'{cmd}\n')
f.close()


In [None]:
!swarm -f {swarm_scripts_dir}/snp_metrics.swarm -g 16 -t 16 --time=04:00:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

In [41]:
# check if snp metrics produced across all samples
completed_list = []
missing_list = []
for sample in samples_list:
    code = sample.split('_')[0]
    metrics_path = f'{snp_metrics_path}/{code}'
    for chrom in chroms:
        mfile = f'{metrics_path}/snp_metrics_{sample}_chr{chrom}.csv'
        if os.path.isfile(mfile):
            completed_list.append(sample)
        else:
            missing_list.append(sample)
completed_final = list(set(completed_list))
missing_final =list(set(missing_list))


In [42]:
print(len(completed_final))
print(len(missing_final))

21503
144


In [None]:
# now check if we have idats for missing snp metrics
missing_idats = []
exists_idats = []
for sample in missing_final:
    code = sample.split('_')[0]
    grn = f'{idat_path}/{code}/{sample}_Grn.idat'
    red = f'{idat_path}/{code}/{sample}_Red.idat'
    if os.path.isfile(grn) & os.path.isfile(red):
        exists_idats.append(sample)
    else:
        missing_idats.append(sample)

In [None]:
print(len(missing_idats))
print(len(exists_idats))

In [20]:
!ls {cnv_path}/*.bim

/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_AAC.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_AAC_clean.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_AFR.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_AFR_clean.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_AJ.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_AJ_clean.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_AMR.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_AMR_clean.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_CAS.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_CAS_clean.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_EAS.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_EAS_clean.bim
/data/CARD/PD/GP2/

In [14]:
cnv_path = f'/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3'
# using just autosomes for CNV analysis due to call quality
chroms = [str(i) for i in range(1,23)]

ancestry_labels = [x.split('/')[-1].replace('.bed','').split('_')[-1] for x in glob.glob('/data/CARD/PD/GP2/genotypes/GP2/round3/clean/GP2_round3_NEW_MERGE*.bed') if '_maf_hwe' not in x]

release_key[['FID','GP2sampleID']].to_csv(f'{cnv_path}/release3.samples', sep='\t', header=False, index=False)
release_key[['GP2sampleID','IID']].to_csv(f'{cnv_path}/release3_sample_id_key.csv')
release_covars = release_key.loc[:,['FID', 'GP2sampleID','sex_for_qc', 'age', 'age_of_onset']]

cnv_types = ['PERCENT_BAF_INSERTION','PERCENT_L2R_DELETION','PERCENT_L2R_DUPLICATION']

In [None]:
# snp QC for cnvs
for label in ancestry_labels:
    out_name = f'GP2_round3_NEW_MERGE_{label}'
    out_path = f'{cnv_path}/{out_name}'
    geno_path = f'/data/CARD/PD/GP2/genotypes/GP2/round3/clean/{out_name}'

    cmd1 = f'\
module load plink/1.9; plink \
--bfile {geno_path} \
--keep {cnv_path}/release3.samples \
--make-bed \
--out {out_path}'

    cmd2 = f'\
module load plink/1.9; plink \
--bfile {out_path} \
--maf 0.01 \
--geno 0.02 \
--hwe 5e-6 \
--autosome \
--make-bed \
--out {out_path}_clean'

    cmd3 = f'\
module load plink/1.9; plink \
--bfile {out_path}_clean \
--pca \
--out {out_path}_clean_pcs'

    cmds = [cmd1, cmd2, cmd3]

    for cmd in cmds:
        !{cmd}

    pcs = pd.read_csv(f'{out_path}_clean_pcs.eigenvec', sep='\s+')
    pc_num = pcs.iloc[:, 2:].shape[1]
    pc_names = ['FID','GP2sampleID'] + [f'PC{i}' for i in range(1, pc_num+1)]
    pcs.columns = pc_names

    cov = pcs.merge(release_covars, on=['FID','GP2sampleID'], how='left')
    cov.age.fillna(cov.age.mean(), inplace=True)
    cov.age_of_onset.fillna(cov.age_of_onset.mean(), inplace=True)
    cov.sex_for_qc.fillna(cov.sex_for_qc.median(), inplace=True)
    cov.rename(columns={'GP2sampleID':'sampleid','sex_for_qc':'sex'}, inplace=True)
    cov.to_csv(f'{out_path}_clean.cov', sep='\t', header=True, index=False)

    samples = cov.merge(release_key[['GP2sampleID','IID']], left_on='sampleid', right_on='GP2sampleID', how='left')
    samples['IID'].to_csv(f'{out_path}_clean_barcode.samples', header=False, index=False)

    bim = pd.read_csv(f'{out_path}_clean.bim', sep='\s+', header=None, names=['chr','id','pos','bp','a1','a2'], usecols=['id'])
    bim.to_csv(f'{out_path}_clean.snps', sep='\t', header=False, index=False)
        

        
# skip ld pruning until we have better quality rare calls
# --exclude exclusion_regions_hg38.txt \


#         cmd3 = f'\
# plink \
# --bfile {geno}_release2_maf_geno_hwe \
# --indep-pairwise 1000 10 0.02 \
# --autosome \
# --out {geno}_release2_prune'
    
#         cmd4 = f'\
# plink \
# --bfile {geno}_release2_maf_geno_hwe \
# --extract {geno}_release2_prune.prune.in \
# --make-bed \
# --out {geno}_release2_clean'

In [48]:
for label in ancestry_labels:
    out_name = f'GP2_round3_NEW_MERGE_{label}'
    out_path = f'{cnv_path}/{out_name}'
    print(label)
    !cat {out_path}_clean.cov | wc -l
    print()

AMR
419

EUR
12052

SAS
85

MDE
81

EAS
1394

AAC
1258

AFR
1423

CAS
310

AJ
1302



# some testing for cnv caller

In [51]:
from numpy.core.numeric import NaN
sample = '204958250147_R03C01'
chrom = '1'
code = sample.split('_')[0]
label = 'EUR'


snp_metrics_file = f'{snp_metrics_path}/{code}/snp_metrics_{sample}_chr{chrom}.csv'
cnv_out = f'{cnv_path}/CNV_{label}_{sample}_chr{chrom}.csv'
intervals_file = f'/data/CARD/PD/GP2/ref_panel/glist_hg38_chr{chrom}.csv'
min_variants = 10
kb_window = 250
bim_path = '/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_EUR_clean.bim'


# Load in the data.
metrics_df = pd.read_csv(snp_metrics_file, engine='c')
bim = pd.read_csv(bim_path, sep='\s+', header=None, names=['chr','id','pos','bp','a1','a2'], usecols=['id'])
sample_df = metrics_df.loc[metrics_df.snpID.isin(bim.id)]

temp_interval_df = pd.read_csv(intervals_file, engine='c')
temp_interval_df.drop_duplicates(subset = ["NAME"], inplace=True, keep='first')
intervals_df = temp_interval_df.copy()


# # Now reduce just to the intervals of interest and summarize each interval.

# # Break down L2R and BAF per gene.

results = []

# interval_list = intervals_df['NAME'].unique()
interval_list = ['A3GALT2']

for INTERVAL in interval_list:
    print(intervals_df.loc[intervals_df['NAME']==INTERVAL])

    interval_CHR = intervals_df.loc[intervals_df['NAME'] == INTERVAL, 'CHR'].item()
    interval_START_gene = intervals_df.loc[intervals_df['NAME'] == INTERVAL, 'START'].item()
    interval_STOP_gene = intervals_df.loc[intervals_df['NAME'] == INTERVAL, 'STOP'].item()
    interval_START = interval_START_gene - (kb_window*1000)
    interval_STOP = interval_STOP_gene + (kb_window*1000)

    temp_df = sample_df.loc[(sample_df['chromosome'] == interval_CHR) & (sample_df['position'] >= interval_START) & (sample_df['position'] <= interval_STOP)]

    if temp_df.shape[0] < min_variants:

        results.append((INTERVAL, temp_df.shape[0], NaN, NaN, NaN, interval_START, interval_START_gene, interval_STOP_gene, interval_STOP))
    else:
        temp_df['BAF_insertion'] = np.where( (temp_df['BAlleleFreq'].between(0.65, 0.85, inclusive='neither')) | (temp_df['BAlleleFreq'].between(0.15, 0.35, inclusive='neither')), 1, 0)
        temp_df['L2R_deletion'] = np.where( temp_df['LogRRatio'] < -0.2, 1, 0)
        temp_df['L2R_insertion'] = np.where( temp_df['LogRRatio'] > 0.2, 1, 0)
        PERCENT_BAF_INSERTION = temp_df['BAF_insertion'].mean()
        PERCENT_L2R_DELETION = temp_df['L2R_deletion'].mean()
        PERCENT_L2R_INSERTION = temp_df['L2R_insertion'].mean()
        results.append((INTERVAL, temp_df.shape[0], PERCENT_BAF_INSERTION, PERCENT_L2R_DELETION, PERCENT_L2R_INSERTION, interval_START, interval_START_gene, interval_STOP_gene, interval_STOP))
    
    output = pd.DataFrame(results, columns=('INTERVAL', 'NUM_VARIANTS', 'PERCENT_BAF_INSERTION', 'PERCENT_L2R_DELETION','PERCENT_L2R_DUPLICATION','START_PLUS_WINDOW','START','STOP','STOP_PLUS_WINDOW'))

   CHR     START      STOP     NAME
0    1  33306765  33321098  A3GALT2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['BAF_insertion'] = np.where( (temp_df['BAlleleFreq'].between(0.65, 0.85, inclusive='neither')) | (temp_df['BAlleleFreq'].between(0.15, 0.35, inclusive='neither')), 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['L2R_deletion'] = np.where( temp_df['LogRRatio'] < -0.2, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/ind

In [52]:
temp_df.shape[0]

111

In [62]:
21895*22*3

1445070

In [36]:
len(metrics_df.snpID.unique())/2580

60.97015503875969

In [44]:
sample_df

Unnamed: 0,chromosome,position,snpID,Sample_ID,Ref,Alt,ALLELE_A,ALLELE_B,BAlleleFreq,LogRRatio,R,Theta,GenTrain_Score,GType,Alt1,Alt2,GT,a1,a2,maf
0,1,49554,1:49554-G-A,204958250147_R03C01,A,G,0,1,0.033875,-0.016783,1.691960,0.050030,0.699477,AA,G,,AA,A,G,0.142857
21,1,826578,JHU_1.761957,204958250147_R03C01,C,T,1,0,1.001940,-0.111973,1.155930,0.962306,0.832417,AA,T,,BB,T,C,0.000000
27,1,833439,rs12562811,204958250147_R03C01,C,T,1,0,0.490758,0.097902,0.764211,0.708393,0.443799,AB,T,,AB,T,C,0.214286
32,1,839745,JHU_1.775124,204958250147_R03C01,C,T,1,0,1.005320,-0.112962,1.098080,0.957153,0.790478,AA,T,,BB,T,C,0.083333
40,1,858952,JHU_1.794331,204958250147_R03C01,G,A,1,0,0.515637,-0.068786,1.557300,0.622296,0.540952,AB,A,,AB,A,G,0.357143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157272,1,248908807,JHU_1.249203005,204958250147_R03C01,G,A,1,0,0.985136,-0.042008,0.977678,0.923677,0.832624,AA,A,,BB,A,G,0.142857
157275,1,248910813,JHU_1.249205011,204958250147_R03C01,C,G,1,0,0.997110,-0.111099,1.261540,0.977864,0.497399,AA,G,,BB,G,C,0.142857
157276,1,248914573,JHU_1.249208771,204958250147_R03C01,C,T,1,0,0.984007,0.041096,1.420910,0.949008,0.755758,AA,T,,BB,T,C,0.000000
157277,1,248916508,rs6704311,204958250147_R03C01,G,A,1,0,0.991263,-0.027500,1.276190,0.963003,0.777869,AA,A,,BB,A,G,0.071429


In [67]:

with open(f'{swarm_scripts_dir}/cnvs.swarm', 'w') as f:
    for label in ancestry_labels:
        out_name = f'GP2_round3_NEW_MERGE_{label}'
        geno_path = f'{cnv_path}/{out_name}_clean'
        bim = f'{geno_path}.bim'
        samples = pd.read_csv(f'{geno_path}_barcode.samples', header=None, names=['IID'])
        
        for sample in samples.IID.unique():
            code = sample.split('_')[0]
            for chrom in chroms:
            
                mfile = f'{snp_metrics_path}/{code}/snp_metrics_{sample}_chr{chrom}.csv'

                cnv_out = f'{cnv_path}/CNV_{label}_{sample}_chr{chrom}.csv'
                intervals = f'/data/CARD/PD/GP2/ref_panel/glist_hg38_chr{chrom}.csv'

                cmd = f'\
python run_cnv_pipeline.py \
--metrics {mfile} \
--bim {bim} \
--out_path {cnv_out} \
--intervals {intervals} \
--min_variants 10 \
--kb_window 250'
            
                f.write(f'{cmd}\n')
f.close()

In [77]:
!swarm -f {swarm_scripts_dir}/cnvs.swarm -g 4 -t 8 --time=00:30:00 --logdir {swarm_scripts_dir}/logs --module python/3.8 --gres=lscratch:20 --partition=norm

52322078


In [74]:
!python3 run_cnv_pipeline.py --metrics /data/CARD/PD/GP2/raw_genotypes/GP2_snp_metrics/204958250147/snp_metrics_204958250147_R03C01_chr1.csv --bim /data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_EUR_clean.bim --out_path /data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_204958250147_R03C01_chr1.csv --intervals /data/CARD/PD/GP2/ref_panel/glist_hg38_chr1.csv --min_variants 10 --kb_window 250


Plink is found
Plink2 is found


In [23]:
# write dosages per chromosome per ancestry per cnv-type

with open(f'{swarm_scripts_dir}/cnv_dosages.swarm', 'w') as f:
    for label in ancestry_labels:
        for chrom in chroms:

            cnv_files_list = glob.glob(f'{cnv_path}/CNV_{label}_*_chr{chrom}.csv')
            cnv_files_df = pd.DataFrame({'filename':cnv_files_list})
            cnv_files_df.to_csv(f'{cnv_path}/CNV_{label}_chr{chrom}_files.csv', header=False, index=False)

            for cnv_type in cnv_types:
                
                cmd = f'\
python run_cnv_dosage_pipeline.py \
--files {cnv_path}/CNV_{label}_chr{chrom}_files.csv \
--label {label} \
--chrom {chrom} \
--cnv_type {cnv_type} \
--out_path {cnv_path}/CNV_{label}_chr{chrom}_{cnv_type}.csv'
                f.write(f'{cmd}\n')
f.close()
            


In [24]:
!swarm -f {swarm_scripts_dir}/cnv_dosages.swarm -g 32 -t 16 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

50839144


In [10]:
!head {cnv_path}/CNV_EUR_chr1_files.csv

/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_204958250147_R03C01_chr1.csv
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_206391680116_R04C01_chr1.csv
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_206046190127_R01C01_chr1.csv
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_206046190102_R06C01_chr1.csv
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_205814910049_R02C01_chr1.csv
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_206046190068_R02C01_chr1.csv
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_204835450146_R04C01_chr1.csv
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_205053530113_R03C01_chr1.csv
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_206430130072_R04C01_chr1.csv
/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_206451070033_R04C01_chr1.csv


In [19]:
!ls /data/CARD/PD/GP2/raw_genotypes/GP2_plink/*204958250147_R03C01*

/data/CARD/PD/GP2/raw_genotypes/GP2_plink/204958250147_R03C01.bed
/data/CARD/PD/GP2/raw_genotypes/GP2_plink/204958250147_R03C01.bim
/data/CARD/PD/GP2/raw_genotypes/GP2_plink/204958250147_R03C01.fam
/data/CARD/PD/GP2/raw_genotypes/GP2_plink/204958250147_R03C01.hh
/data/CARD/PD/GP2/raw_genotypes/GP2_plink/204958250147_R03C01.log
/data/CARD/PD/GP2/raw_genotypes/GP2_plink/204958250147_R03C01.map
/data/CARD/PD/GP2/raw_genotypes/GP2_plink/204958250147_R03C01.ped


In [30]:
# cnv_dosage = pd.read_csv('/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_204958250147_R03C01_chr1.csv')
# cnv_dosage[cnv_dosage.NUM_VARIANTS>250]
cnv_dosage.NUM_VARIANTS.std()
cnv_dosage.NUM_VARIANTS.mean()

448.7984496124031

In [49]:
cnv_dosage

Unnamed: 0,INTERVAL,NUM_VARIANTS,PERCENT_BAF_INSERTION,PERCENT_L2R_DELETION,PERCENT_L2R_DUPLICATION,START_PLUS_WINDOW,START,STOP,STOP_PLUS_WINDOW
0,A3GALT2,357,0.005602,0.039216,0.075630,33056765,33306765,33321098,33571098
1,AADACL3,351,0.002849,0.034188,0.079772,12466114,12716114,12728759,12978759
2,AADACL4,376,0.002660,0.031915,0.077128,12394546,12644546,12667086,12917086
3,ABCA4,944,0.002119,0.049788,0.056144,93742837,93992837,94121149,94371149
4,ABCB10,579,0.000000,0.022453,0.075993,229266581,229516581,229558695,229808695
...,...,...,...,...,...,...,...,...,...
2575,ZSCAN20,417,0.004796,0.023981,0.067146,33222630,33472630,33496395,33746395
2576,ZSWIM5,811,0.003699,0.019729,0.081381,44766403,45016403,45206578,45456578
2577,ZYG11A,363,0.002755,0.024793,0.093664,52592510,52842510,52894575,53144575
2578,ZYG11B,290,0.003448,0.020690,0.100000,52476458,52726458,52827341,53077341


In [None]:
cnv_dosage[cnv_dosage]

In [33]:
82609/2580

32.01899224806201

In [25]:
with open(f'{swarm_scripts_dir}/update_cnv_ids.swarm','w') as f:
    for label in ancestry_labels:
        for cnv_type in cnv_types:
            for chrom in chroms:

                dosagefile = f'{cnv_path}/CNV_{label}_chr{chrom}_{cnv_type}.csv'
                dosagefile_out = f'{cnv_path}/CNV_{label}_chr{chrom}_{cnv_type}_gp2ids.csv'
                pheno_out = f'{cnv_path}/GP2_{label}_chr{chrom}_{cnv_type}.pheno'
                cmd = f'\
python update_cnv_ids.py \
--dosagefile {dosagefile} \
--key /data/CARD/PD/GP2/clinical/key_merge/GP2_merge/release3_key.csv \
--pheno_out {pheno_out} \
--out_path {dosagefile_out}'
                f.write(f'{cmd}\n')
f.close()

In [26]:
!swarm -f {swarm_scripts_dir}/update_cnv_ids.swarm -g 8 -t 8 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

50843587


In [27]:
cnv_out_path = '/data/CARD/PD/GP2/genotypes/GP2/round3/GP2_cnvs'
with open(f'{swarm_scripts_dir}/cnvwas.swarm', 'w') as f:
    for label in ancestry_labels:
        for cnv_type in cnv_types:
            for chrom in chroms:

                dosagefile = f'{cnv_path}/CNV_{label}_chr{chrom}_{cnv_type}_gp2ids.csv'
                pheno = f'{cnv_path}/GP2_{label}_chr{chrom}_{cnv_type}.pheno'
                covar = f'{cnv_path}/GP2_round3_NEW_MERGE_{label}_clean.cov'
                out_path = f'{cnv_out_path}/GP2_{label}_chr{chrom}_{cnv_type}_cnvwas.tab'

                cmd = f'\
python run_cnvwas_pipeline.py \
--cnv_dosage_file {dosagefile} \
--out_path {out_path} \
--pheno {pheno} \
--covar {covar}'
            
                f.write(f'{cmd}\n')
f.close()


In [28]:
!swarm -f {swarm_scripts_dir}/cnvwas.swarm -g 32 -t 16 --time=00:30:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

50860098


In [5]:
# output final release samples
cnv_out_path = '/data/CARD/PD/GP2/genotypes/GP2/round3/GP2_cnvs'
release3_samples = '/data/CARD/PD/GP2/genotypes/GP2/round3/clean/release3.samples'
release_key_final = pd.read_csv('/data/CARD/PD/GP2/genotypes/GP2/round3/clean/release3_master_key.csv')
release_key_out = release_key_final.loc[release_key_final.pheno!=-9]
release_key_out['GP2sampleID'].to_csv(release3_samples, index=False, header=False)

In [143]:
with open(f'{swarm_scripts_dir}/cnv_dosage_release_split.swarm', 'w') as f:
    for label in ancestry_labels:
        sample_file = f'/data/CARD/PD/GP2/genotypes/GP2/round3/clean/{label}_release3.samples'
        label_samples = release_key_final.loc[release_key_final['label']==label]
        label_samples['GP2sampleID'].to_csv(sample_file, header=False, index=False)

        for cnv_type in cnv_types:
            for chrom in chroms:
                
                dosagefile = f'{cnv_path}/CNV_{label}_chr{chrom}_{cnv_type}_gp2ids.csv'
                dosage_out = f'{cnv_out_path}/CNV_{label}_chr{chrom}_{cnv_type}_release3.csv'
                
                cmd = f'\
python split_release_samples.py \
--cnv_dosage_file {dosagefile} \
--samples {sample_file} \
--out_path {dosage_out}'
            
                f.write(f'{cmd}\n')
f.close()


In [149]:
!swarm -f {swarm_scripts_dir}/cnv_dosage_release_split.swarm -g 8 -t 8 --time=00:30:00 --logdir {swarm_scripts_dir}/logs --gres=lscratch:20 --partition=norm

51651310


In [151]:
# CHECK COUNTS
total_dosages = 0
for label in ancestry_labels:
    for cnv_type in cnv_types:
        dosage_out = f'{cnv_out_path}/CNV_{label}_chr1_{cnv_type}_release3.csv'
        print(label, cnv_type)
        label_count = !cat {dosage_out} | wc -l
        label_count = int(label_count[0])
        total_dosages += label_count
        print(label_count)
        print()

AMR PERCENT_BAF_INSERTION
412

AMR PERCENT_L2R_DELETION
412

AMR PERCENT_L2R_DUPLICATION
412

EUR PERCENT_BAF_INSERTION
10727

EUR PERCENT_L2R_DELETION
10727

EUR PERCENT_L2R_DUPLICATION
10727

SAS PERCENT_BAF_INSERTION
59

SAS PERCENT_L2R_DELETION
59

SAS PERCENT_L2R_DUPLICATION
59

MDE PERCENT_BAF_INSERTION
62

MDE PERCENT_L2R_DELETION
62

MDE PERCENT_L2R_DUPLICATION
62

EAS PERCENT_BAF_INSERTION
137

EAS PERCENT_L2R_DELETION
137

EAS PERCENT_L2R_DUPLICATION
137

AAC PERCENT_BAF_INSERTION
1215

AAC PERCENT_L2R_DELETION
1215

AAC PERCENT_L2R_DUPLICATION
1215

AFR PERCENT_BAF_INSERTION
404

AFR PERCENT_L2R_DELETION
404

AFR PERCENT_L2R_DUPLICATION
404

CAS PERCENT_BAF_INSERTION
303

CAS PERCENT_L2R_DELETION
303

CAS PERCENT_L2R_DUPLICATION
303

AJ PERCENT_BAF_INSERTION
923

AJ PERCENT_L2R_DELETION
923

AJ PERCENT_L2R_DUPLICATION
923



In [21]:
# push to release bucket
with open(f'{swarm_scripts_dir}/gcp_push_cnvs.swarm','w') as f:
    
    for label in ancestry_labels:
        for chrom in chroms:
    
#         !module load google-cloud-sdk/397.0.0; gsutil cp {cnv_out_path}/CNV_{label}_BAF_INSERTION_release3.csv gs://gp2tier2/release3_31102022/cnvs/{label}/
            cp1 = f'gsutil cp {cnv_out_path}/CNV_{label}_chr{chrom}_PERCENT_BAF_INSERTION_release3.csv gs://gp2tier2/release3_31102022/cnvs/{label}/'
            cp2 = f'gsutil cp {cnv_out_path}/CNV_{label}_chr{chrom}_PERCENT_L2R_DELETION_release3.csv gs://gp2tier2/release3_31102022/cnvs/{label}/'
            cp3 = f'gsutil cp {cnv_out_path}/CNV_{label}_chr{chrom}_PERCENT_L2R_DUPLICATION_release3.csv gs://gp2tier2/release3_31102022/cnvs/{label}/'

            cps = [cp1, cp2, cp3]

            for cp in cps:
                f.write(f'{cp}\n')
f.close()

In [28]:
!swarm -f {swarm_scripts_dir}/gcp_push_cnvs.swarm -g 8 -t 8 --time=00:30:00 --logdir {swarm_scripts_dir}/logs --module google-cloud-sdk/397.0.0 --gres=lscratch:20 --partition=norm

51673863


In [24]:
!module load google-cloud-sdk/397.0.0; gsutil cp /data/CARD/PD/GP2/genotypes/GP2/round3/GP2_cnvs/CNV_AMR_chr1_BAF_INSERTION_release3.csv gs://gp2tier2/release3_31102022/cnvs/AMR/

594


27.0

In [5]:
dosage_df

Unnamed: 0,sampleid,A3GALT2,AADACL3,AADACL4,ABCA4,ABCB10,ABCD3,ABL2,ACADM,ACAP3,...,ZNHIT6,ZP4,ZRANB2,ZRANB2_AS1,ZRANB2_AS2,ZSCAN20,ZSWIM5,ZYG11A,ZYG11B,ZZZ3
0,CORIELL_000178_s1,0.005602,0.002849,0.002660,0.002119,0.000000,0.000000,0.000000,0.001992,0.003546,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.004796,0.003699,0.002755,0.003448,0.000000
1,PAGE_000432_s1,0.008403,0.008547,0.007979,0.003178,0.039724,0.007653,0.000000,0.001992,0.144208,...,0.003311,0.000000,0.014925,0.014634,0.011538,0.004796,0.018496,0.011019,0.013793,0.010526
2,CORIELL_000783_s1,0.002801,0.000000,0.000000,0.001059,0.001727,0.002551,0.000000,0.000000,0.004728,...,0.003311,0.000000,0.000000,0.000000,0.000000,0.002398,0.001233,0.000000,0.000000,0.003509
3,CORIELL_004856_s1,0.000000,0.002849,0.002660,0.001059,0.005181,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,CORIELL_007135_s1,0.002801,0.000000,0.000000,0.000000,0.003454,0.002551,0.000000,0.000000,0.003546,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002466,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11690,PAGE_001054_s1,0.005602,0.008547,0.007979,0.003178,0.012090,0.007653,0.000000,0.000000,0.076832,...,0.000000,0.000000,0.000000,0.000000,0.007692,0.002398,0.008631,0.013774,0.013793,0.000000
11691,LCC_000146_s1,0.008403,0.002849,0.002660,0.001059,0.010363,0.002551,0.000000,0.000000,0.008274,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.004796,0.000000,0.000000,0.000000,0.000000
11692,CORIELL_002459_s1,0.002801,0.002849,0.002660,0.001059,0.008636,0.005102,0.000000,0.001992,0.004728,...,0.006623,0.001447,0.000000,0.000000,0.000000,0.002398,0.004932,0.002755,0.003448,0.014035
11693,BCM_000036_s1,0.002801,0.000000,0.000000,0.000000,0.000000,0.002551,0.000000,0.001992,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001233,0.005510,0.006897,0.007018


# testing for cnvwas

In [3]:
# test = pd.read_csv('/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_chr1_PERCENT_BAF_INSERTION_gp2ids.csv')
# test
cnv_dosage_file = '/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_EUR_chr1_PERCENT_BAF_INSERTION_gp2ids.csv'
pheno = '/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_EUR_chr1_PERCENT_BAF_INSERTION.pheno'
covar = '/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_EUR_clean.cov'

scaler = MinMaxScaler()
dosage_df = pd.read_csv(cnv_dosage_file)
dosage_df.dropna(axis=1, inplace=True)
#fix column names
dosage_df.columns = [x.replace('-','_') for x in dosage_df.columns]
dosage_df.columns = [x.replace('.','_') for x in dosage_df.columns]
dosage_df.columns = [x.replace(' ','_') for x in dosage_df.columns]

pheno_df = pd.read_csv(pheno, sep='\t')
covar_df = pd.read_csv(covar, sep='\t')


if covar_df.age_of_onset.isna().all():
    covar_df.drop(columns=['age_of_onset'], inplace=True)
else:
    covar_df.loc[:,'age_of_onset'] = scaler.fit_transform(covar_df[['age_of_onset']])

if covar_df.age.isna().all():
    covar_df.drop(columns=['age'], inplace=True)
else:
    covar_df.loc[:,'age'] = scaler.fit_transform(covar_df[['age']])

if covar_df.sex.isna().all():
    covar_df.drop(columns=['sex'], inplace=True)

covar_df.drop(columns=['FID'], inplace=True)
covar_df.rename(columns={'GP2sampleID':'sampleid'}, inplace=True)

data_df = dosage_df.merge(covar_df, on='sampleid', how='left').merge(pheno_df, on='sampleid', how='left').set_index('sampleid')
data_df.dropna(axis=0, inplace=True)
rm_pred = [f'PC{i}' for i in range(1,21)] + ['sex','age_of_onset','age','pheno']

pred_list = [x for x in data_df.columns if x not in rm_pred]
covars_list = [x for x in data_df.columns if x not in pred_list + [f'PC{i}' for i in range(11,21)] + ['pheno']]

results = []
fails = []

for pred in range(len(pred_list)):
    pred_name = pred_list[pred]
    
    mean_ = data_df.loc[data_df['pheno']==0 ,pred_name].mean()
    std_ = data_df.loc[data_df['pheno']==0 ,pred_name].std()
    # mean_ = data_df.loc[: ,pred_name].mean()
    # std_ = data_df.loc[: ,pred_name].std()
    recoded_df = data_df.copy()
    recoded_df.loc[:, pred_name] = np.where((data_df[pred_name]>mean_+(2*std_)) | (data_df[pred_name]<mean_-(2*std_)), 1, 0)
#     recoded_df.loc[:, pred_name] = np.where((data_df[pred_name]>control_mean+(2*control_std)) | (data_df[pred_name]<control_mean-(2*control_std)), 1, 0)
    pred_ = [pred_name] + covars_list
    X = recoded_df.loc[:, pred_]
    y = recoded_df.loc[:,'pheno']

#     formula = "pheno ~ " + pred_name + " + " + ' + '.join(covars_list)
#     fitted = sm.formula.glm(formula=formula, family=sm.families.Binomial(), data=recoded_df).fit()
#     pred_ = [pred_name] + covars_list
#     X = data_df.loc[:, pred_]
#     y = data_df.loc[:,'pheno']
    fitted = sm.OLS(y, X).fit()
#     fitted = sm.formula.glm(formula=formula, family=sm.families.Binomial(), data=data_df).fit()
    beta_coef  = fitted.params.loc[pred_name]
    beta_se  = fitted.bse.loc[pred_name]
    p_val = fitted.pvalues.loc[pred_name]

    results.append((pred_name, beta_coef, beta_se, p_val))


output = pd.DataFrame(results, columns=('PREDICTOR', 'BETA_COEF', 'BETA_SE','P_VAL'))

In [4]:
data_df

Unnamed: 0_level_0,A3GALT2,AADACL3,AADACL4,ABCA4,ABCB10,ABCD3,ABL2,ACADM,ACAP3,ACBD3,...,PC15,PC16,PC17,PC18,PC19,PC20,sex,age,age_of_onset,pheno
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CORIELL_000178_s1,0.005602,0.002849,0.002660,0.002119,0.000000,0.000000,0.000000,0.001992,0.003546,0.000000,...,0.010497,-0.001786,-0.000480,-0.015263,0.022669,-0.001714,1.0,0.637500,0.582418,1
PAGE_000432_s1,0.008403,0.008547,0.007979,0.003178,0.039724,0.007653,0.000000,0.001992,0.144208,0.033846,...,0.001478,-0.007132,0.017460,0.014483,0.002081,-0.014048,1.0,0.761490,0.729853,1
CORIELL_000783_s1,0.002801,0.000000,0.000000,0.001059,0.001727,0.002551,0.000000,0.000000,0.004728,0.000000,...,-0.005627,-0.006498,-0.001086,-0.014374,-0.000299,0.005867,2.0,0.512500,0.560440,1
CORIELL_004856_s1,0.000000,0.002849,0.002660,0.001059,0.005181,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.003527,0.015134,-0.002175,0.000688,-0.005928,0.018949,1.0,0.612500,0.552213,0
CORIELL_007135_s1,0.002801,0.000000,0.000000,0.000000,0.003454,0.002551,0.000000,0.000000,0.003546,0.000000,...,0.001130,0.006795,0.004029,-0.005951,-0.007167,-0.005832,2.0,0.775000,0.552213,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PAGE_001054_s1,0.005602,0.008547,0.007979,0.003178,0.012090,0.007653,0.000000,0.000000,0.076832,0.018462,...,-0.004930,0.004625,-0.004943,0.008030,-0.008413,-0.006565,1.0,0.715358,0.638278,1
LCC_000146_s1,0.008403,0.002849,0.002660,0.001059,0.010363,0.002551,0.000000,0.000000,0.008274,0.000000,...,-0.002144,-0.002246,-0.006940,0.002406,0.005469,-0.010250,1.0,0.537500,0.552213,0
CORIELL_002459_s1,0.002801,0.002849,0.002660,0.001059,0.008636,0.005102,0.000000,0.001992,0.004728,0.006154,...,-0.010934,0.003952,0.001125,0.011962,0.006304,0.015792,1.0,0.525000,0.461538,1
BCM_000036_s1,0.002801,0.000000,0.000000,0.000000,0.000000,0.002551,0.000000,0.001992,0.000000,0.003077,...,0.003476,-0.011146,0.016683,0.011988,-0.000492,0.006304,2.0,0.556571,0.552213,1


In [26]:
from GWAS.gwas import calculate_inflation

Plink is found
Plink2 is found
Plink is found
Plink2 is found


In [49]:
calculate_inflation(output.P_VAL)


RUNNING: lambda_calculation



{'pass': True,
 'step': 'lambda_calculation',
 'metrics': {'inflation': 30.206209829338817}}

In [32]:
output.max()

PREDICTOR        ZZZ3
BETA_COEF    0.791209
BETA_SE      0.117274
P_VAL        0.000001
dtype: object

In [None]:
# for label in labels:
#     for dose in dosage_files:
#         tab = f'{cnv_path}/{label}/GP2_round2_{label}_{dose}_gp2ids_cnvwas.tab'
#         test_sumstat = pd.read_csv(tab, sep='\t')
#         n_hits = test_sumstat[test_sumstat.P_VAL<=0.05/25000].shape[0]
#         print(f'{label}, {dose}:')
#         print(f'{n_hits} hits, {calculate_inflation(test_sumstat.P_VAL)} lambda')
#         print()
#         print()

In [27]:
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
scaler = MinMaxScaler()

cnv_dosage_file = '/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/CNV_AMR_chr1_PERCENT_BAF_INSERTION_gp2ids.csv'
pheno = '/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_AMR_chr1_PERCENT_BAF_INSERTION.pheno'
covar = '/data/CARD/PD/GP2/raw_genotypes/GP2_cnvs/release3/GP2_round3_NEW_MERGE_AMR_clean.cov'
dosage_df = pd.read_csv(cnv_dosage_file)
#fix column names
dosage_df.columns = [x.replace('-','_') for x in dosage_df.columns]
dosage_df.columns = [x.replace('.','_') for x in dosage_df.columns]
dosage_df.columns = [x.replace(' ','_') for x in dosage_df.columns]

pheno_df = pd.read_csv(pheno, sep='\t')
covar_df = pd.read_csv(covar, sep='\t')





if covar_df.age_of_onset.isna().all():
    covar_df.drop(columns=['age_of_onset'], inplace=True)
else:
    covar_df.loc[:,'age_of_onset'] = scaler.fit_transform(covar_df[['age_of_onset']])

if covar_df.age.isna().all():
    covar_df.drop(columns=['age'], inplace=True)
else:
    covar_df.loc[:,'age'] = scaler.fit_transform(covar_df[['age']])

if covar_df.sex.isna().all():
    covar_df.drop(columns=['sex'], inplace=True)

covar_df.drop(columns=['FID'], inplace=True)
covar_df.rename(columns={'GP2sampleID':'sampleid'}, inplace=True)

data_df = dosage_df.merge(covar_df, on='sampleid', how='left').merge(pheno_df, on='sampleid', how='left').set_index('sampleid')

rm_pred = [f'PC{i}' for i in range(1,21)] + ['sex','age_of_onset','age','pheno']

pred_list = [x for x in data_df.columns if x not in rm_pred]
covars_list = [x for x in data_df.columns if x not in pred_list + [f'PC{i}' for i in range(11,21)] + ['pheno']]

results = []
fails = []

for pred in range(len(pred_list)):
    pred_name = pred_list[pred]
    formula = "pheno ~ " + pred_name + " + " + ' + '.join(covars_list)

    fitted = sm.formula.glm(formula=formula, family=sm.families.Binomial(), data=data_df).fit()
    beta_coef  = fitted.params.loc[pred_name]
    beta_se  = fitted.bse.loc[pred_name]
    p_val = fitted.pvalues.loc[pred_name]

    results.append((pred_name, beta_coef, beta_se, p_val))


output = pd.DataFrame(results, columns=('PREDICTOR', 'BETA_COEF', 'BETA_SE','P_VAL'))
# output.to_csv(out_path, sep='\t', header=True, index=False)

In [10]:
pred_name = 'ADAMTSL4-AS1'
formula = "pheno ~ " + pred_name + " + " + ' + '.join(covars_list)
fitted = sm.formula.glm(formula=formula, family=sm.families.Binomial(), data=data_df).fit()

In [30]:
output[output.P_VAL<=0.05/output.shape[0]]

Unnamed: 0,PREDICTOR,BETA_COEF,BETA_SE,P_VAL


In [32]:
0.05/output.shape[0]

1.9546520719311964e-05

In [26]:
dosage_df.columns

Index(['sampleid', 'A3GALT2', 'AADACL3', 'AADACL4', 'ABCA4', 'ABCB10', 'ABCD3',
       'ABL2', 'ACADM', 'ACAP3',
       ...
       'ZNHIT6', 'ZP4', 'ZRANB2', 'ZRANB2_AS1', 'ZRANB2_AS2', 'ZSCAN20',
       'ZSWIM5', 'ZYG11A', 'ZYG11B', 'ZZZ3'],
      dtype='object', length=2559)

In [42]:
def CNV_WAS(cnv_dosage_file, pheno, covar, out_path):
    scaler = MinMaxScaler()
    dosage_df = pd.read_csv(cnv_dosage_file)
    pheno_df = pd.read_csv(pheno, sep='\t')
    covar_df = pd.read_csv(covar, sep='\t')


    if covar_df.age_of_onset.isna().all():
        covar_df.drop(columns=['age_of_onset'], inplace=True)
    else:
        covar_df.loc[:,'age_of_onset'] = scaler.fit_transform(covar_df[['age_of_onset']])

    if covar_df.age.isna().all():
        covar_df.drop(columns=['age'], inplace=True)
    else:
        covar_df.loc[:,'age'] = scaler.fit_transform(covar_df[['age']])

    if covar_df.sex.isna().all():
        covar_df.drop(columns=['sex'], inplace=True)

    covar_df.drop(columns=['FID'], inplace=True)
    covar_df.rename(columns={'GP2sampleID':'sampleid'}, inplace=True)

    data_df = dosage_df.merge(covar_df, on='sampleid', how='left').merge(pheno_df, on='sampleid', how='left').set_index('sampleid')

    rm_pred = [f'PC{i}' for i in range(1,21)] + ['sex','age_of_onset','age','pheno']

    pred_list = [x for x in data_df.columns if x not in rm_pred]
    covars_list = [x for x in data_df.columns if x not in pred_list + [f'PC{i}' for i in range(11,21)] + ['pheno']]

    results = []
    fails = []

    for pred in range(len(pred_list)):
        pred_name = pred_list[pred]
        formula = "pheno ~ " + pred_name + " + " + ' + '.join(covars_list)

        fitted = sm.formula.glm(formula=formula, family=sm.families.Binomial(), data=data_df).fit()
        beta_coef  = fitted.params.loc[pred_name]
        beta_se  = fitted.bse.loc[pred_name]
        p_val = fitted.pvalues.loc[pred_name]

        results.append((pred_name, beta_coef, beta_se, p_val))


    output = pd.DataFrame(results, columns=('PREDICTOR', 'BETA_COEF', 'BETA_SE','P_VAL'))
    output.to_csv(out_path, sep='\t', header=True, index=False)

FID	sampleid	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9	PC10	PC11	PC12	PC13	PC14	PC15	PC16	PC17	PC18	PC19	PC20	sex	age	age_of_onset
0	APGS_000002_s1	0.0055697	5.58512e-05	0.00699746	-0.00260114	-0.0122619	-0.010135	0.00431493	-0.012719	-0.00293343	0.00668866	0.0053588	0.008123	-0.000890374	-0.0141451	0.0119144	0.000390307	-0.00671753	-0.000866958	-0.0013909	0.00290549	1	65.52570310775978	57.25140654302988
0	APGS_000003_s1	0.00326122	-0.00888383	-0.00871568	0.000830058	-0.00808219	-0.00634886	0.00276938	0.00309513	-0.00118999	-0.00757583	0.00328179	0.00329943	0.0057767	-0.0128158	0.0133359	0.000640707	-0.00633914	0.0100417	0.00191561	0.0077697	1	65.52570310775978	57.25140654302988
0	APGS_000005_s1	0.000738409	-0.00199674	0.00794567	0.00732722	0.0111577	0.00256656	0.00058314	-0.00580364	-0.00200246	-0.00346736	0.0119227	0.00547475	0.00136185	-0.0146136	0.00129355	-0.00678011	-0.00567244	0.0212485	-0.00767652	-0.00502232	1	65.52570310775978	57.25140654302988
0	APGS_000006_s1	0.00713627	0.0010006

In [16]:
198/9

22.0