In [1]:
# Merges nearby calls

# Takes in a table of SV2 calls
# and merges a sample's calls where the gap length is < 20% of the total CNV length
# and the gap is less than 50,000 bp

In [2]:
import pandas as pd

In [3]:
# read the breakpoint-resolved SV2 calls (tab-separated table)
sv2 = pd.read_csv(
    'output/merged.sv2.breakpoints_resolved.tsv',
    sep='\t',
)

In [4]:
# order the calls by chromosome first, then by start coordinate
sv2 = sv2.sort_values(by=['chrom', 'start'])

In [5]:
# collect the SV types, chromosomes, and samples to iterate over
svtypes = ['DUP', 'DEL']
chromosomes = [chrom_name for chrom_name in sv2['chrom'].unique()]
samples = [sample_id for sample_id in sv2['sample'].unique()]

In [9]:
def should_merge(start1, end1, start2, end2, max_gap=50000, max_gap_fraction=0.2):
    """Decide whether two structural-variant calls should be merged.

    Two calls are merged when they touch or overlap, or when the gap
    between them is small both in absolute terms and relative to the
    span the merged call would cover.

    Parameters
    ----------
    start1, end1 : int
        Coordinates of the first call.
    start2, end2 : int
        Coordinates of the second call.
    max_gap : int, optional
        The gap must be strictly smaller than this many bp (default 50000).
    max_gap_fraction : float, optional
        The gap must be strictly smaller than this fraction of the
        combined span of both calls (default 0.2).

    Returns
    -------
    bool
        True if the calls should be merged, False otherwise.
    """
    # distance from the earlier end to the later start;
    # zero or negative means the calls touch or overlap
    gap = max(start1, start2) - min(end1, end2)

    # if there is no gap then merge
    if gap <= 0:
        return True

    # length of both CNVs together (leftmost start to rightmost end, gap included)
    total_length = max(end1, end2) - min(start1, start2)

    # merge only if the gap is small in absolute AND relative terms;
    # bool() keeps the return a plain Python bool even for numpy inputs
    return bool((gap < max_gap) and (gap < (max_gap_fraction * total_length)))

def merge_genes(genes1, genes2):
    """Combine the gene annotations from two SV calls.

    Each annotation is a '|'-separated string of gene names. The result
    contains every distinct gene exactly once, sorted alphabetically so
    the output is the same on every run and does not depend on argument
    order.

    Parameters
    ----------
    genes1, genes2 : str
        '|'-separated gene names. A non-string value (e.g. NaN or None
        for a call without annotation) is treated as "no genes".

    Returns
    -------
    str
        '|'-separated, de-duplicated, sorted gene names; '' if neither
        input contributes a gene.
    """
    combined = set()

    for annotation in (genes1, genes2):
        # missing annotations are not strings and contribute nothing
        if not isinstance(annotation, str):
            continue
        # skip empty tokens so '' or 'A||B' cannot produce stray separators
        combined.update(gene for gene in annotation.split('|') if gene)

    return '|'.join(sorted(combined))

def merge_nearby(df, sample, chrom, svtype):
    """Merge overlapping or nearby calls within one group of SV calls.

    Walks the rows of ``df`` in order while tracking a running "previous"
    call. Each row is either folded into the previous call (when
    ``should_merge`` says so) or closes it and becomes the new previous
    call.

    Parameters
    ----------
    df : pandas.DataFrame
        Calls with columns 'start', 'end', 'svlength' and 'genes'.
        Expected to be sorted by starting position.
    sample, chrom, svtype
        Copied unchanged into every output record.

    Returns
    -------
    list of list
        One ``[sample, svtype, chrom, start, end, svlength, genes]``
        record per merged call; an empty list if ``df`` has no rows.
    """
    # will place merged calls here
    output = []

    # read rows positionally so the index labels (which may be
    # non-contiguous or duplicated after filtering) never matter
    rows = df[['start', 'end', 'svlength', 'genes']].itertuples(index=False, name=None)

    # the first row seeds the "previous" (p) values; no rows -> nothing to merge
    first_row = next(rows, None)
    if first_row is None:
        return output
    startp, endp, lengthp, genesp = first_row

    # loop over the remaining rows
    for start, end, length, genes in rows:

        # compare current values to previous values
        if should_merge(startp, endp, start, end):
            # fold the current call into the previous one
            if genes != genesp:
                genesp = merge_genes(genes, genesp)

            startp = min(start, startp)
            endp = max(end, endp)
            # length of a merged call is recomputed from its merged coordinates
            lengthp = endp - startp

        else:
            # no merging required: emit the previous call
            # and start tracking the current one
            output.append([sample, svtype, chrom, startp, endp, lengthp, genesp])
            startp, endp, lengthp, genesp = start, end, length, genes

    # append final set of values
    output.append([sample, svtype, chrom, startp, endp, lengthp, genesp])

    return output

In [11]:
# loop over every sample, chrom, and svtype
# and merge overlapping or nearby structural variants

# store the new records here (one list per merged call)
merged_records = []

# loop over samples
for samp in samples:
    print(samp)

    # filter once per sample instead of re-scanning the whole table
    # for every chrom/svtype combination
    samp_sv2 = sv2[sv2['sample'] == samp]

    # loop over chrom
    for chrom in chromosomes:
        chrom_sv2 = samp_sv2[samp_sv2['chrom'] == chrom]

        # loop over svtype
        for svtype in svtypes:

            # get variants for sample, chrom, and svtype
            # (row order of sv2 is preserved, so calls stay sorted by start)
            sub_sv2 = chrom_sv2[chrom_sv2['svtype'] == svtype]

            # if no variants, then skip
            if sub_sv2.shape[0] == 0:
                continue

            # get merged values and add them to the new records in place
            merged_calls = merge_nearby(sub_sv2, samp, chrom, svtype)
            merged_records.extend(merged_calls)


# new sv2 values to dataframe; naming the columns here keeps the frame
# well-formed (seven named columns) even when no records were produced
new_sv2 = pd.DataFrame(
    merged_records,
    columns=['sample', 'svtype', 'chrom', 'start', 'end', 'svlength', 'genes'],
)

SG172
SG021
SG339
SG341
SG141
SG016
SG022
SG024
SG036
SG041
SG042
SG043
SG078
SG103
SG122
SG126
SG127
SG135
SG170
SG175
SG176
SG178
SG188
SG190
SG194
SG198
SG199
SG206
SG208
SG209
SG220
SG227
SG228
SG229
SG233
SG250
SG251
SG257
SG262
SG264
SG265
SG266
SG278
SG290
SG302
SG304
SG305
SG307
SG308
SG309
SG310
SG362
SG059
SG060
SG139
SG146
SG363
SG366
SG369
SG375
SG376
SG377
SG406
SG409
SG410
SG411
SG447
SG450
SG453
SG546
SG548
SG559
SG560
SG213
SG261
SG399
SG477
SG023
SG300
SG065
SG448
SG455
SG470
SG474
SG478
SG555
SG557
SG473
SG077
SG186
SG463
SG466
SG017
SG147
SG475
SG066
SG368
SG404
SG151
SG243
SG340
SG454
SG561
SG394
SG398
SG481
SG479
SG058
SG138
SG145
SG154
SG379
SG380
SG395
SG405
SG449
SG460
SG476
SG480
SG553
SG554
SG556
SG320
SG184
SG155
SG183
SG169
SG249
SG196
SG287
SG201
SG294
SG197
SG234
SG273
SG319
SG321
SG267
SG462
SG464
SG044
SG027
SG232
SG342
SG343
SG344
SG446
SG472
SG231
SG002
SG011
SG079
SG081
SG101
SG124
SG226
SG230
SG295
SG303
SG332
SG335
SG337
SG350
SG031
SG047
SG050
SG10

In [12]:
# label the columns of the merged-call table
new_sv2.columns = [
    'sample', 'svtype', 'chrom',
    'start', 'end',
    'svlength', 'genes',
]

In [13]:
# write the merged calls out as a tab-separated file, without the row index
new_sv2.to_csv(
    'output/merged.sv2.breakpoints_resolved.nearby_merged.tsv',
    index=False,
    sep='\t',
)