## CNV Parsing for RNA paper

We will only use deletions and duplications for this paper.
All other structural variants, such as inversion and translocation, will be ignored. 

 1. all cnvs from PennCNV
 2. CNVnator, lumpy, manta, delly
 
  a. join gaps with lucilla parameters: Adjacent CNVs with overlapping base pairs or gaps with <20% of CNV length and <50 kbp were merged
      
  b. any cnv that is called by two callers
    
Annotathe the CNV genes for frequency of CNVs across families
See if CNVs lead to change in local gene expression.

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 200)

In [2]:
pheno = pd.read_csv('../16p12.2_rnaseq_analysis/data/pheno_final.tsv', sep='\t')

samples = list(pheno.subject.unique())

In [3]:
with open('survivor/all_codes.list', 'r') as f:
    allsamples = f.readlines()
allsamples = [s.strip() for s in allsamples]

In [4]:
recip = 0.5
max_gap = 50e3

In [5]:
def get_svlenth(s):
    l = s.split(';')
    s = 'SVLEN=0'
    for item in l:
        if item.startswith('SVLEN='):
            s = item
    s = s.split('=')[1]
    if s.startswith('-'):
        s = s[1:]
    return int(s)

def get_svtype(s):
    l = s.split(';')
    s = 'SVTYPE=0'
    for item in l:
        if item.startswith('SVTYPE='):
            s = item
    s = s.split('=')[1]
    if s.startswith('-'):
        s = s[1:]
    return s

def get_gt(s):
    l = s.split(':')
    s = l[0]
    return s

def get_end(s):
    l = s.split(';')
    s = 'END=0'
    for item in l:
        if item.startswith('END='):
            s = item
    s = s.split('=')[1]
    if s.startswith('-'):
        s = s[1:]
    return int(s)

In [6]:
chromosomes = ['chr{}'.format(s) for s in list(range(1, 23)) + ['X', 'Y', 'M']]
svtypes = ['DEL', 'DUP']

In [7]:
rsamples = [s for s in allsamples if s not in samples]
rsamples

['SG013',
 'SG016',
 'SG017',
 'SG033',
 'SG036',
 'SG047',
 'SG047',
 'SG050',
 'SG058',
 'SG059',
 'SG060',
 'SG061',
 'SG064',
 'SG065',
 'SG066',
 'SG070',
 'SG077',
 'SG078',
 'SG079',
 'SG080',
 'SG081',
 'SG095',
 'SG096',
 'SG098',
 'SG099',
 'SG101',
 'SG102',
 'SG103',
 'SG107',
 'SG117',
 'SG121',
 'SG122',
 'SG123',
 'SG124',
 'SG125',
 'SG126',
 'SG127',
 'SG128',
 'SG129',
 'SG130',
 'SG134',
 'SG135',
 'SG136',
 'SG137',
 'SG138',
 'SG139',
 'SG140',
 'SG141',
 'SG142',
 'SG144',
 'SG145',
 'SG146',
 'SG147',
 'SG153',
 'SG154',
 'SG163',
 'SG164',
 'SG165',
 'SG166',
 'SG167',
 'SG168',
 'SG169',
 'SG170',
 'SG171',
 'SG172',
 'SG175',
 'SG176',
 'SG177',
 'SG178',
 'SG179',
 'SG180',
 'SG181',
 'SG182',
 'SG183',
 'SG184',
 'SG185',
 'SG186',
 'SG187',
 'SG188',
 'SG189',
 'SG190',
 'SG194',
 'SG195',
 'SG196',
 'SG197',
 'SG198',
 'SG199',
 'SG200',
 'SG201',
 'SG205',
 'SG206',
 'SG207',
 'SG208',
 'SG209',
 'SG210',
 'SG212',
 'SG213',
 'SG214',
 'SG215',
 'SG216',


In [16]:
for sample in allsamples:
    print(sample)

    df1 = pd.read_csv('output/manta/{}.manta.tsv'.format(sample), sep='\t')
    df2 = pd.read_csv('output/lumpy/{}.lumpy.tsv'.format(sample), sep='\t')
    df3 = pd.read_csv('output/delly/{}.delly.tsv'.format(sample), sep='\t')
    df4 = pd.read_csv('output/cnvnator_small/{}.cnvnator_small.tsv'.format(sample), sep='\t')

    df1 = df1.sort_values(['chrom', 'start'])
    df2 = df2.sort_values(['chrom', 'start'])
    df3 = df3.sort_values(['chrom', 'start'])
    df4 = df4.sort_values(['chrom', 'start'])

    df1['caller'] = 'manta'
    df2['caller'] = 'lumpy'
    df3['caller'] = 'delly'
    df4['caller'] = 'cnvnator'

    keep = []

    for chrom in chromosomes[:]:
        for svtype in svtypes:
    #         print(chrom, svtype)
            df1d = df1[(df1.svtype == svtype) & (df1.chrom == chrom)]
            df2d = df2[(df2.svtype == svtype) & (df2.chrom == chrom)]
            df3d = df3[(df3.svtype == svtype) & (df3.chrom == chrom)]
            df4d = df4[(df4.svtype == svtype) & (df4.chrom == chrom)]

            dfd = pd.concat([df1d, df2d, df3d, df4d])

            dfd = dfd.sort_values(['start', 'caller'] )
            dfd = dfd.reset_index(drop=True)

            done_i_list = []

            for i in dfd.index[:]:

                if i in done_i_list:
                    continue

                start = dfd.at[i, 'start']
                end = dfd.at[i, 'end']
                length = dfd.at[i, 'length']
                caller = dfd.at[i, 'caller']
            #     print(i, start, end , length, caller)

                min_end    = dfd['end'].apply(lambda x: min(x, end))
                max_start  = dfd['start'].apply(lambda x: max(x, start))
                max_length = dfd['length'].apply(lambda x: max(x, length))
                overlap    = (min_end - max_start) > .5 * max_length

                if dfd[(overlap) & (dfd.caller != caller)].shape[0] > 0:
                    odf = dfd[(overlap)]
                    keep.append([chrom, odf['start'].max(), odf['end'].min(), svtype, ','.join(odf.caller.unique()), len(odf.caller.unique())])


                    done_i_list = done_i_list + list(odf.index)
                else:
                    done_i_list.append(i)
    keep = pd.DataFrame(keep)
    keep.columns = ['chrom', 'start', 'end', 'svtype', 'caller', 'num_callers']
    keep['length'] = keep['end'] - keep['start']
    keep = keep.sort_values(['chrom', 'start'])

    keep.to_csv('output/merged/{}.merged.tsv'.format(sample), sep='\t', index=False)

SG001
SG002
SG003
SG006
SG007
SG011
SG013
SG016
SG017
SG021
SG022
SG023
SG024
SG025
SG026
SG027
SG030
SG031
SG033
SG036
SG037
SG038
SG039
SG040
SG041
SG042
SG043
SG044
SG045
SG046
SG047
SG047
SG050
SG058
SG059
SG060
SG061
SG064
SG065
SG066
SG069
SG070
SG077
SG078
SG079
SG080
SG081
SG095
SG096
SG098
SG099
SG101
SG102
SG103
SG107
SG117
SG121
SG122
SG123
SG124
SG125
SG126
SG127
SG128
SG129
SG130
SG134
SG135
SG136
SG137
SG138
SG139
SG140
SG141
SG142
SG144
SG145
SG146
SG147
SG148
SG149
SG150
SG151
SG152
SG153
SG154
SG155
SG163
SG164
SG165
SG166
SG167
SG168
SG169
SG170
SG171
SG172
SG175
SG176
SG177
SG178
SG179
SG180
SG181
SG182
SG183
SG184
SG185
SG186
SG187
SG188
SG189
SG190
SG194
SG195
SG196
SG197
SG198
SG199
SG200
SG201
SG205
SG206
SG207
SG208
SG209
SG210
SG212
SG213
SG214
SG215
SG216
SG217
SG218
SG219
SG220
SG221
SG222
SG223
SG224
SG225
SG226
SG227
SG228
SG229
SG230
SG231
SG232
SG233
SG234
SG236
SG237
SG243
SG244
SG245
SG247
SG248
SG249
SG250
SG251
SG252
SG253
SG254
SG255
SG256
SG257
SG25