In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the merged SV2 call table (tab-separated).
# The annotation loop further down reads these columns from it:
#   'chrom', 'svtype', 'start', 'end', 'svlength'
# Any additional columns are carried through unchanged.
svs = pd.read_csv(
    'output/merged.sv2.breakpoints_resolved.nearby_merged.tsv',
    delimiter='\t',
)

In [3]:
# gnomAD v2.1 structural-variant sites (non-neuro subset), downloaded from
# https://gnomad.broadinstitute.org/downloads#v2-structural-variants
#
# '#chrom' is read explicitly as text so every chromosome label is a str
# before the 'chr' prefix is added, and low_memory=False makes pandas infer
# each remaining column's dtype from the whole file instead of chunk by chunk
# (chunk-wise inference is what produces mixed-type columns and the
# accompanying DtypeWarning).
gnomad = pd.read_csv(
    'gnomad_v2.1_sv.nonneuro.sites.bed.gz',
    sep='\t',
    dtype={'#chrom': str},
    low_memory=False,
)

# keep only deletions / duplications that passed all gnomAD filters
is_del_or_dup = gnomad.svtype.isin(['DEL', 'DUP'])
is_pass = gnomad.FILTER == 'PASS'
gnomad = gnomad[is_del_or_dup & is_pass].copy()

# chromosome name with a 'chr' prefix, for comparison against svs.chrom
gnomad['chrom'] = 'chr' + gnomad['#chrom']

# allele frequency as float; converted only after the DEL/DUP filter so that
# rows of other SV types never have to be parsed as a number
gnomad['AF'] = gnomad.AF.astype(float)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Annotate every cohort DEL/DUP in `svs` with the summed gnomAD allele
# frequency ('AF') of all gnomAD SVs of the same type on the same chromosome
# that reciprocally overlap it. Rows of any other svtype keep the '.' default.

# A gnomAD SV matches when the shared span is strictly greater than this
# fraction of the longer of the two variants.
RECIPROCAL_OVERLAP = 0.5


def _overlapping_af_sum(start, end, length, g_starts, g_ends, g_lengths, g_afs,
                        min_fraction=RECIPROCAL_OVERLAP):
    """Sum the allele frequencies of the gnomAD SVs overlapping one cohort SV.

    Parameters
    ----------
    start, end, length : scalar
        Coordinates and length of the cohort SV. The absolute value of
        ``length`` is used, so a negatively signed length cannot silently
        drop out of the "longer of the two" comparison.
    g_starts, g_ends, g_lengths, g_afs : numpy.ndarray
        Parallel arrays (one entry per gnomAD SV) holding start, end,
        ``end - start`` and allele frequency.
    min_fraction : float
        Required overlap as a fraction of the longer SV (strict ``>``).

    Returns
    -------
    float
        Sum of ``g_afs`` over the matching gnomAD SVs, ignoring NaN;
        ``0.0`` when nothing matches or the arrays are empty.
    """
    shared = np.minimum(g_ends, end) - np.maximum(g_starts, start)
    longest = np.maximum(g_lengths, abs(length))
    return np.nansum(g_afs[shared > min_fraction * longest])


# loop over every chrom and svtype
chromosomes = list(svs.chrom.unique())
svtypes = ['DUP', 'DEL']
svs['gnomad_freq_non_neuro'] = '.'

# The start coordinate is read from 'start'; tables that name it 'pos'
# instead are accepted as a fallback.
if 'start' in svs.columns:
    start_col = 'start'
elif 'pos' in svs.columns:
    start_col = 'pos'
else:
    raise KeyError("svs needs a 'start' (or 'pos') column to compute overlaps")

for chrom in chromosomes:
    for svtype in svtypes:
        # all structural variants of this chromosome and svtype
        # (boolean-mask selection already yields new frames)
        sub_svs = svs[(svs.chrom == chrom) & (svs.svtype == svtype)]
        sub_gnomad = gnomad[(gnomad['chrom'] == chrom) & (gnomad.svtype == svtype)]

        # numpy views of the gnomAD columns, extracted once per group
        # (numpy is much faster than per-row pandas indexing)
        chrom_starts = sub_gnomad['start'].to_numpy()
        chrom_ends = sub_gnomad['end'].to_numpy()
        chrom_svlengths = chrom_ends - chrom_starts
        chrom_afs = sub_gnomad['AF'].to_numpy()

        # compare each cohort SV against every gnomAD SV in the group;
        # when several gnomAD SVs overlap, their frequencies are summed
        cohort_rows = zip(
            sub_svs.index,
            sub_svs[start_col].to_numpy(),
            sub_svs['end'].to_numpy(),
            sub_svs['svlength'].to_numpy(),
        )
        for i, start, end, length in cohort_rows:
            # save to table (index labels of sub_svs are those of svs)
            svs.at[i, 'gnomad_freq_non_neuro'] = _overlapping_af_sum(
                start, end, length,
                chrom_starts, chrom_ends, chrom_svlengths, chrom_afs,
            )

In [None]:
# Write the annotated table as a tab-separated file, without the index column.
svs.to_csv(
    'output/merged.sv2.gnomad_non_neuro.tsv',
    index=False,
    sep='\t',
)