In [1]:
# Gets the intracohort structural variant count based on
# 50% reciprocal overlap between structural variants

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the merged structural-variant table (tab-separated). The cells below
# read its chrom, svtype, start, end and svlength columns.
svs = pd.read_csv(
    'output/merged.sv2.breakpoints_resolved.nearby_merged.tsv',
    sep='\t',
)

In [None]:
def _count_reciprocal_overlaps(starts, ends, lengths, min_fraction=.5):
    """Count, for each variant, how many variants in the group overlap it.

    Variant ``j`` is counted for variant ``k`` when the shared interval
    ``min(end_j, end_k) - max(start_j, start_k)`` is strictly greater than
    ``min_fraction * max(length_j, length_k)``. Testing against the longer
    of the two lengths makes the overlap requirement reciprocal.

    Each variant is also compared against itself, so it contributes to its
    own count whenever its own span passes the test.

    NOTE(review): this assumes ``lengths`` are positive and consistent with
    ``ends - starts`` -- confirm against how ``svlength`` is produced.

    Parameters
    ----------
    starts, ends, lengths : array-like, all of the same length
        Start coordinate, end coordinate and length of every variant in
        one (chromosome, svtype) group.
    min_fraction : float, default 0.5
        Fraction of the longer variant that the shared interval must exceed.

    Returns
    -------
    numpy.ndarray of int64
        ``counts[k]`` is the number of variants in the group that satisfy
        the overlap test against variant ``k``.
    """
    starts = np.asarray(starts)
    ends = np.asarray(ends)
    lengths = np.asarray(lengths)

    counts = np.zeros(len(starts), dtype=np.int64)
    # One vectorised comparison per variant: memory stays O(n) instead of
    # materialising the full n x n pairwise matrix.
    for k in range(len(starts)):
        shared = np.minimum(ends, ends[k]) - np.maximum(starts, starts[k])
        longer = np.maximum(lengths, lengths[k])
        counts[k] = np.count_nonzero(shared > min_fraction * longer)
    return counts


# Variants are only compared within the same chromosome and the same svtype.
chromosomes = list(svs.chrom.unique())
svtypes = ['DUP', 'DEL']

# Create the output column up front so it exists even when no DUP/DEL rows
# are present; rows of any other svtype keep NaN.
if 'intracohort_count' not in svs.columns:
    svs['intracohort_count'] = np.nan

for chrom in chromosomes:
    on_chrom = svs.chrom == chrom
    for svtype in svtypes:
        in_group = on_chrom & (svs.svtype == svtype)
        if not in_group.any():
            continue

        # Pull plain numpy arrays once per group; the inner loop then avoids
        # the per-row Series construction that DataFrame.iterrows() performs.
        group = svs.loc[in_group, ['start', 'end', 'svlength']]
        group_counts = _count_reciprocal_overlaps(
            group['start'].to_numpy(),
            group['end'].to_numpy(),
            group['svlength'].to_numpy(),
        )

        # A single bulk write per group replaces one scalar .at write per row.
        svs.loc[in_group, 'intracohort_count'] = group_counts

In [None]:
# Write the annotated table back out as TSV; index=False keeps the
# DataFrame's row index out of the file.
svs.to_csv(
    'output/merged.sv2.intracohort.tsv',
    sep='\t',
    index=False,
)