In [None]:
# Gets the intracohort structural variant count based on
# 50% reciprocal overlap between structural variants

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the merged PennCNV structural-variant calls (tab-separated table).
svs = pd.read_csv(
    filepath_or_buffer='output/merged.PennCNV.tsv',
    sep='\t',
)

In [13]:
# Annotate every structural variant (SV) with the number of SVs of the same
# type on the same chromosome that reciprocally overlap it.

# Minimum fraction of the longer SV that the shared interval must exceed.
MIN_RECIPROCAL_OVERLAP = 0.5


def count_reciprocal_overlaps(starts, ends, lengths, min_fraction=MIN_RECIPROCAL_OVERLAP):
    """Count, for each interval, the intervals that reciprocally overlap it.

    Two intervals match when the number of shared bases is strictly greater
    than ``min_fraction`` times the length of the longer of the two, which
    implies the threshold also holds for the shorter one.

    Coordinates are treated as inclusive, matching the ``Length`` column of
    the input table (``Length == End - Start + 1``), so the shared span is
    ``min(end) - max(start) + 1``.  Every interval with a positive length
    therefore matches itself and gets a count of at least 1.

    Parameters
    ----------
    starts, ends, lengths : array-like
        Parallel sequences describing the intervals to compare with one
        another (all on the same chromosome and of the same SV type).
    min_fraction : float, optional
        Overlap threshold as a fraction of the longer interval.

    Returns
    -------
    numpy.ndarray
        Integer array, one count per input interval, in input order.
    """
    starts = np.asarray(starts)
    ends = np.asarray(ends)
    lengths = np.asarray(lengths)

    counts = np.zeros(len(starts), dtype=int)
    # One vectorised comparison per interval keeps memory linear in the group
    # size instead of materialising a full pairwise matrix.
    for k in range(len(starts)):
        shared_bases = np.minimum(ends, ends[k]) - np.maximum(starts, starts[k]) + 1
        longer_length = np.maximum(lengths, lengths[k])
        counts[k] = np.count_nonzero(shared_bases > min_fraction * longer_length)
    return counts


# loop over every chrom and svtype
chromosomes = list(svs.Chromosome.unique())
svtypes = ['dup', 'del']

# Pull the coordinate columns out once; plain numpy arrays are much faster
# than row-wise pandas access.
all_starts = svs['Start'].to_numpy()
all_ends = svs['End'].to_numpy()
all_lengths = svs['Length'].to_numpy()

# Counts are collected by row *position*, so the result does not depend on
# the index labels of `svs` being unique.  Rows whose Type is not listed in
# `svtypes` are not compared against anything and keep a count of 0 (instead
# of being left as NaN, which would make an integer cast fail).
intracohort_counts = np.zeros(len(svs), dtype=int)

for chrom in chromosomes:
    for svtype in svtypes:
        # positions of all structural variants of this type on this chromosome
        in_group = ((svs['Chromosome'] == chrom) & (svs['Type'] == svtype)).to_numpy()
        positions = np.flatnonzero(in_group)

        intracohort_counts[positions] = count_reciprocal_overlaps(
            all_starts[positions],
            all_ends[positions],
            all_lengths[positions],
        )

svs['intracohort_count'] = intracohort_counts

In [14]:
# Write the annotated table as a tab-separated file, without the pandas index.
svs.to_csv(
    path_or_buf='output/merged.PennCNV.intracohort.tsv',
    sep='\t',
    index=False,
)

In [15]:
# Show the annotated table, including the new `intracohort_count` column
# (the rendered output below elides the middle rows).
svs

Unnamed: 0,Chromosome,Start,End,NumSNPs,Length,Type,StartSNP,EndSNP,PatientID,%SD,%CenTel,Genes,GeneList,intracohort_count
0,chr1,752721,854250,8,101530,dup,rs3131972,rs7537756,SG172,0.9770,1.0000,5,"FAM87B,LINC00115,LINC01128,FAM41C,LOC100130417",1
1,chr1,959842,1026707,16,66866,dup,rs2710888,rs4074137,SG467,0.0107,0.6006,3,"AGRN,RNF223,C1orf159",5
2,chr1,959842,1040026,19,80185,dup,rs2710888,rs6671356,SG469,0.0089,0.5008,3,"AGRN,RNF223,C1orf159",14
3,chr1,962210,1025301,13,63092,del,rs3128126,rs9442400,SG045,0.0000,0.5990,3,"AGRN,RNF223,C1orf159",1
4,chr1,962210,1026707,15,64497,dup,rs3128126,rs4074137,SG404,0.0000,0.5859,3,"AGRN,RNF223,C1orf159",5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2603,chrX,149444023,149593953,23,149931,dup,rs9723612,rs683913,SG334,0.3775,0.0000,1,MAMLD1,1
2604,chrX,151170560,151239285,6,68726,dup,rs11798839,rs1388517,SG399,0.0399,0.0000,0,,1
2605,chrX,152228236,152596189,20,367954,dup,rs1045059,rs12841019,SG482,0.0049,0.0000,3,"PNMA3,PNMA6A,MAGEA1",1
2606,chrX,152480761,152626413,23,145653,dup,rs6627761,rs12841273,SG177,0.0036,0.0000,2,"MAGEA1,ZNF275",1
