In [None]:
pip install bgzip

In [2]:
import pandas as pd
import dxpy
import os
import bgzip

In [None]:
# Input and output files

# Burden table. This file was generated as described in
# https://github.com/deeprob/BMI_monogenic/blob/main/src/genetic_data_processing/burden_preparation/dnanexus_notebooks/0_prepare_burden_file.ipynb
INPUT = "/path/to/input/burden.tsv"

# CNV calls. Use the output from script
# 1_Variant calling/UKB/2_CNV_annotation/9_annotate_loeuf.py
CNV_PATH = "/path/to/CNV/calls"
CNV_FILE = "cnv_calls.csv"

# Upload folder and file names for the SNV and CNV burden tables
OUTPUT_DIR = "/path/to/output/directory"
OUTPUT_SNV = "output_snv_burden_filename.csv"
OUTPUT_CNV = "output_cnv_burden_filename.csv"

In [None]:
# Parse burden data for relevant samples (tab-separated, gzip-compressed)
df = pd.read_csv(INPUT, sep='\t', compression='gzip')

# Annotate variant types. Later assignments overwrite earlier ones, so a row
# flagged in several columns ends up with priority lof > missense > splice.
df['Variant_type'] = ''
for flag_column, label in [('splice_lof', 'splice'), ('missense', 'missense'), ('lof', 'lof')]:
    # Build a strict boolean mask: .loc would interpret a 0/1 integer column as
    # row labels and rejects a mask that contains NaN. For a column that is
    # already boolean this comparison is a no-op.
    is_flagged = df[flag_column].eq(True)
    df.loc[is_flagged, 'Variant_type'] = label

# Keep only the columns used downstream. The explicit copy makes this an
# independent frame, so the assignment below does not raise SettingWithCopyWarning.
df = df[['gene', 'Variant_type', 'samples', 'variants']].copy()
# Missing gene names become empty strings so the column holds text only
df['gene'] = df['gene'].fillna('')

In [4]:
# Collapse by variant: one row per variant, with the distinct values of every
# other column joined into a comma-separated string.
# sorted() makes the joined strings reproducible; iterating a bare set of
# strings can yield a different order in every Python process.
# NOTE(review): a variant listed under several genes gets a joined gene value
# such as "A,B" -- confirm this is intended wherever 'gene' is matched later.
df = (
    df[['gene', 'Variant_type', 'samples', 'variants']]
    .groupby('variants')
    .agg(lambda values: ','.join(sorted(set(values))))
    .reset_index()
)

In [5]:
# Re-annotate variant type from the collapsed Variant_type string.
# Labels are applied in increasing priority: a later match overwrites an
# earlier one, so 'lof' wins over 'missense', which wins over 'splice'.
df['vtype'] = ''
for label in ('splice', 'missense', 'lof'):
    has_label = df['Variant_type'].str.contains(label)
    df.loc[has_label, 'vtype'] = label

In [None]:
# Annotate with LOEUF
# The gnomAD LOEUF annotations can be downloaded here: https://gnomad.broadinstitute.org/data#v2-constraint
with open('/path/to/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz', "rb") as raw, bgzip.BGZipReader(raw) as fh:
    loeuf = pd.read_csv(fh, sep="\t")
# Only the gene name and its oe_lof_upper value are needed
loeuf = loeuf.loc[:, ['gene', 'oe_lof_upper']]

# Default (inner) merge: rows whose 'gene' value has no exact match in the
# gnomAD table are dropped here.
df = df.merge(loeuf, on='gene')
# Flag rows whose oe_lof_upper is at or below 0.35
df['low_loeuf'] = df['oe_lof_upper'].le(0.35)

In [7]:
# Collapse table by sample: split the comma-separated sample list so that
# every (variant, sample) pair gets its own row, then count variants per
# (Sample, vtype, low_loeuf) combination.
df['Sample'] = df['samples'].str.split(',')

sampdf = df.explode('Sample')

burden = (
    sampdf[['Sample', 'vtype', 'variants', 'low_loeuf']]
    .groupby(['Sample', 'vtype', 'low_loeuf'])
    .count()
    .reset_index()
)

In [8]:
# Pivot
# Name the output column for every (variant type, low_loeuf) combination; the
# '_LF' suffix marks the rows where low_loeuf is True. Rows whose vtype is not
# one of the three known types keep the empty column name.
snv_labels = {'lof': 'LOF', 'missense': 'Missense', 'splice': 'Splice'}
burden['column'] = ''
for vtype_key, label in snv_labels.items():
    is_type = burden['vtype'] == vtype_key
    burden.loc[is_type & ~burden['low_loeuf'], 'column'] = label
    burden.loc[is_type & burden['low_loeuf'], 'column'] = label + '_LF'

burddf = burden.pivot(index='Sample', columns='column', values='variants')

# pivot() only creates columns for categories that occur in the data. Add any
# missing expected column (filled with 0) so the per-column arithmetic in the
# following cells cannot fail with a KeyError on small or filtered inputs.
expected_snv_columns = [label + suffix for label in snv_labels.values() for suffix in ('', '_LF')]
burddf = burddf.reindex(columns=burddf.columns.union(expected_snv_columns), fill_value=0)

In [9]:
# Clean up: missing combinations become 0, counts are cast to integers, and
# Sample moves from the index back to a regular column.
burddf = burddf.fillna(0).astype(int).reset_index()

In [10]:
# Add LF counts to non-LF counts
# NOTE: the base columns are overwritten, so running this cell a second time
# adds the LF counts again.
for base in ('Missense', 'LOF', 'Splice'):
    lf_counts = burddf[base + '_LF']
    burddf[base] = burddf[base] + lf_counts

In [11]:
# Add "All coding SNVs" columns (sum of missense, lof, and splice), once for
# the base columns and once for their _LF counterparts.
for suffix in ('', '_LF'):
    component_columns = [name + suffix for name in ('Missense', 'LOF', 'Splice')]
    burddf['All_coding_SNVs' + suffix] = burddf[component_columns].sum(axis=1)

In [None]:
# Save: write the SNV burden table locally, upload it to OUTPUT_DIR
# (creating parent folders if needed), then delete the local copy.
burddf.to_csv(OUTPUT_SNV, index=False)
dxpy.upload_local_file(filename=OUTPUT_SNV, folder=OUTPUT_DIR, parents=True)
os.remove(OUTPUT_SNV)

In [None]:
# CNVs
# Read the CNV calls from the configured directory. CNV_PATH was previously
# never used; keep the old behaviour (bare file name, resolved against the
# working directory) as a fallback when the file is not found under CNV_PATH.
cnv_location = os.path.join(CNV_PATH, CNV_FILE)
if not os.path.exists(cnv_location):
    cnv_location = CNV_FILE
# NOTE(review): the file is parsed as tab-separated although CNV_FILE ends in .csv
cnv = pd.read_csv(cnv_location, sep='\t')

# Keep calls with a positive Sample value and drop 16p12.1 deletions.
is_16p12_del = (cnv['Pathogenic'] == '16p12.1') & (cnv['Type'] == 'DEL')
# .copy() yields an independent frame, so adding a column below does not raise
# SettingWithCopyWarning.
cnv = cnv.loc[(cnv['Sample'] > 0) & ~is_16p12_del].copy()
# Flag rows whose LOEUF is at or below 0.35
cnv['low_loeuf'] = cnv['LOEUF'] <= 0.35

In [15]:
burden=cnv[['Sample', 'Type', 'low_loeuf', 'Gene_ID']].groupby(['Sample', 'Type', 'low_loeuf']).agg('count')
burden.reset_index(inplace=True)

# Pivot
burden['column']=''
burden.loc[(burden['Type']=='DEL') & ~(burden.low_loeuf), 'column']='Genes_del'
burden.loc[(burden['Type']=='DEL') & (burden.low_loeuf), 'column']='Genes_del_LF'
burden.loc[(burden['Type']=='DUP') & ~(burden.low_loeuf), 'column']='Genes_dup'
burden.loc[(burden['Type']=='DUP') & (burden.low_loeuf), 'column']='Genes_dup_LF'

burddf=burden.pivot(index='Sample', columns='column', values='Gene_ID')

In [16]:
# Clean up: missing combinations become 0, counts are cast to integers, and
# Sample moves from the index back to a regular column.
burddf = burddf.fillna(0).astype(int).reset_index()

# Add LF counts to non-LF counts
# NOTE: the base columns are overwritten, so running this cell a second time
# adds the LF counts again.
for base in ('Genes_del', 'Genes_dup'):
    lf_counts = burddf[base + '_LF']
    burddf[base] = burddf[base] + lf_counts

In [None]:
# Save: write the CNV burden table locally, upload it to OUTPUT_DIR
# (creating parent folders if needed), then delete the local copy.
burddf.to_csv(OUTPUT_CNV, index=False)
dxpy.upload_local_file(filename=OUTPUT_CNV, folder=OUTPUT_DIR, parents=True)
os.remove(OUTPUT_CNV)