In [1]:
# Resolves breakpoints from SV2 output.

# Takes in a table of SV2 calls and, within each sample, chromosome and SV type,
# merges calls that overlap (Note: no 50% reciprocal overlap requirement).
# The outer breakpoints become the new breakpoints

In [2]:
import pandas as pd

In [3]:
# read the merged SV2 call table (tab-separated)
sv2_path = 'output/merged.sv2.tsv'
sv2 = pd.read_csv(sv2_path, sep='\t')

In [4]:
# columns needed downstream; every other column is dropped
cols = [
    'chrom', 'pos', 'record', 'sample', 'end',
    'svlength', 'svtype', 'denovo_filter', 'genes',
]

# subset to those columns, then order the rows by chromosome and start position
sv2 = sv2[cols].copy().sort_values(['chrom', 'pos'])

In [5]:
# SV types that are processed (fixed list)
svtypes = ['DUP', 'DEL']

# distinct chromosomes and samples, in order of first appearance in sv2
chromosomes = [*sv2['chrom'].unique()]
samples = [*sv2['sample'].unique()]

In [6]:
def should_merge(start1, end1, start2, end2):
    """Decide whether two structural variants should be merged.

    The shared span is the smaller of the two ends minus the larger of the
    two starts. The calls are merged when that span is zero or positive,
    i.e. when the intervals overlap or touch; no minimum overlap fraction
    is required.
    """
    shared_span = min(end1, end2) - max(start1, start2)
    return True if shared_span >= 0 else False

def merge_genes(genes1, genes2):
    """Combine the gene annotations of two SV calls.

    Parameters
    ----------
    genes1, genes2 : str or missing value
        '|'-separated gene names. Any value that is not a string (for
        example the float NaN that pandas uses for an empty field) is
        treated as "no genes" instead of raising on ``.split``.

    Returns
    -------
    str
        The unique gene names from both inputs, sorted alphabetically and
        joined with '|'. An empty string when neither input has genes.
    """
    combined = set()
    for annotation in (genes1, genes2):
        # non-string annotations (NaN / None) carry no gene names
        if isinstance(annotation, str):
            # skip empty tokens so '' or a trailing '|' adds no blank gene
            combined.update(name for name in annotation.split('|') if name)

    # sort so the result is reproducible: the iteration order of a set of
    # strings can change from one interpreter run to the next
    return '|'.join(sorted(combined))

def merge_nearby(df, sample, chrom, svtype):
    """Merge overlapping calls of one sample / chromosome / SV type.

    Parameters
    ----------
    df : pandas.DataFrame
        Calls with the columns 'pos', 'end', 'svlength' and 'genes'.
        The rows are sorted by 'pos' inside this function, so the caller
        does not have to pre-sort them.
    sample, chrom, svtype
        Values copied unchanged into every output record.

    Returns
    -------
    list of list
        One ``[sample, svtype, chrom, start, end, svlength, genes]`` record
        per merged group. A merged group spans the outermost breakpoints of
        its members and its length is recomputed as ``end - start``; a call
        that is not merged keeps its original 'svlength'. An empty frame
        gives an empty list.
    """
    # nothing to merge; also avoids indexing the first row of an empty frame
    if df.empty:
        return []

    # the sweep below only compares a call with the group before it, which
    # is only correct when calls are visited in order of start position;
    # mergesort is stable, so already-sorted input keeps its row order
    df = df.sort_values('pos', kind='mergesort')

    # iterate over column values rather than index labels so that duplicate
    # index labels cannot turn a scalar lookup into a Series
    rows = zip(df['pos'], df['end'], df['svlength'], df['genes'])

    # p stands for previous: the group that is currently being built
    startp, endp, lengthp, genesp = next(rows)

    # will place merged calls here
    output = []

    for start, end, length, genes in rows:

        if should_merge(startp, endp, start, end):
            # extend the current group with this call
            if genes != genesp:
                genesp = merge_genes(genes, genesp)

            startp = min(start, startp)
            endp = max(end, endp)
            lengthp = endp - startp

        else:
            # the current group is finished: emit it and start a new one
            output.append([sample, svtype, chrom, startp, endp, lengthp, genesp])
            startp, endp, lengthp, genesp = start, end, length, genes

    # append final set of values
    output.append([sample, svtype, chrom, startp, endp, lengthp, genesp])

    return output

In [7]:
# loop over every chrom, svtype, and sample
# and merge overlapping structural variants

# Split the table once by (sample, chrom, svtype) instead of re-filtering the
# whole frame for every combination. Rows keep their sorted order inside each
# group, and rows with a missing key are left out, just as they were by the
# equality filters this replaces.
calls_by_group = {
    key: group
    for key, group in sv2.groupby(['sample', 'chrom', 'svtype'], sort=False)
}

# store the new records here
merged_records = []

# same nesting order as before (sample, then chrom, then svtype), so the
# output rows come out in the same order
for samp in samples:
    for chrom in chromosomes:
        for svtype in svtypes:

            # get variants for sample, chrom, and svtype
            sub_sv2 = calls_by_group.get((samp, chrom, svtype))

            # if no variants, then skip
            if sub_sv2 is None:
                continue

            # extend in place; `list + list` would copy every earlier
            # record on each iteration
            merged_records.extend(merge_nearby(sub_sv2, samp, chrom, svtype))

# new sv2 values to dataframe
# The columns are given here so that the frame has all seven columns even
# when no record was produced.
new_sv2 = pd.DataFrame(
    merged_records,
    columns=['sample', 'svtype', 'chrom', 'start', 'end', 'svlength', 'genes'],
)

In [9]:
# label the merged records; the order matches the records built by merge_nearby
resolved_columns = ['sample', 'svtype', 'chrom', 'start', 'end', 'svlength', 'genes']
new_sv2.columns = resolved_columns

In [10]:
# write the breakpoint-resolved calls as a tab-separated file, without the index
resolved_path = 'output/merged.sv2.breakpoints_resolved.tsv'
new_sv2.to_csv(resolved_path, sep='\t', index=False)