## CNV Parsing for RNA paper

We will only use deletions and duplications for this paper.
All other structural variants, such as inversions and translocations, will be ignored.

 1. all cnvs from PennCNV
 2. CNVnator, lumpy, manta, delly
 
  a. join gaps using Lucilla's parameters: adjacent CNVs that overlap, or that are separated by a gap of <20% of the CNV length and <50 kbp, are merged
      
  b. any cnv that is called by two callers
    
Annotate the CNV genes with the frequency of CNVs across families.
See if CNVs lead to changes in local gene expression.

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 200)

In [3]:
# Phenotype table for the RNA-seq cohort (tab-separated).
pheno = pd.read_csv(
    '/data5/16p12_RNA/scripts/16p12.2_rnaseq_analysis/data/pheno_final.tsv',
    sep='\t',
)

# Distinct subject identifiers, in order of first appearance in the table.
samples = list(pheno['subject'].unique())

In [4]:
# One sample code per line; surrounding whitespace/newlines are stripped.
with open('survivor/all_codes.list', 'r') as handle:
    allsamples = [line.strip() for line in handle]

In [5]:
# Largest gap, in base pairs, between adjacent CNVs (50 kbp, the same value
# as the "<50 kbp" gap limit in the notes at the top of this notebook).
# NOTE(review): the caller cells below spell out 50e3 themselves rather than
# reading this name.
max_gap = 50e3

# NOTE(review): presumably a reciprocal-overlap fraction for matching calls
# between callers; it is not referenced in the cells visible here - confirm.
recip = 0.5

In [6]:
def get_svlenth(s):
    """Return the SVLEN value of a ';'-separated INFO string as an int.

    If several fields start with ``SVLEN=`` the last one is used; if none
    does, the result is 0.  A single leading '-' is dropped before the
    conversion, so ``SVLEN=-1500`` yields 1500.
    """
    candidates = [field for field in s.split(';') if field.startswith('SVLEN=')]
    chosen = candidates[-1] if candidates else 'SVLEN=0'
    value = chosen.split('=')[1]
    return int(value[1:] if value.startswith('-') else value)

def get_svtype(s):
    """Return the SVTYPE value of a ';'-separated INFO string.

    The last field starting with ``SVTYPE=`` is used; when there is none the
    string '0' is returned.  A single leading '-' on the value is removed.
    """
    candidates = [field for field in s.split(';') if field.startswith('SVTYPE=')]
    chosen = candidates[-1] if candidates else 'SVTYPE=0'
    value = chosen.split('=')[1]
    return value[1:] if value.startswith('-') else value

def get_gt(s):
    """Return the text before the first ':' of a sample column value.

    The whole string is returned when it contains no ':'.
    """
    first_field, _, _ = s.partition(':')
    return first_field

def get_end(s):
    """Return the END value of a ';'-separated INFO string as an int.

    Only fields that start with ``END=`` are considered (``CIEND=`` does not
    match).  The last such field is used; without one the result is 0.  A
    single leading '-' is dropped before the conversion.
    """
    token = 'END=0'
    # Scanning from the right and stopping at the first hit picks the last
    # matching field.
    for field in reversed(s.split(';')):
        if field.startswith('END='):
            token = field
            break
    number = token.split('=')[1]
    if number.startswith('-'):
        number = number[1:]
    return int(number)

# cnvnator_small

In [9]:
caller = 'cnvnator_small'

# Column layout of the single-sample VCF body; '#' header lines are skipped.
columns = ['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info', 'format', 'sample']
filepath = '/data5/16p12_WGS/structural_variants/vcf_callers/output/cnvnator/bin200/cnv2vcf.{}.cnvnator.vcf'

# For every sample: load the caller's VCF, keep DEL/DUP calls shorter than
# 50 kbp, merge neighbouring calls of the same type, and write one TSV.
for sample in allsamples:
    df = pd.read_csv(filepath.format(sample),
                     sep='\t', comment='#', header=None, names=columns)

    df['svtype'] = df['info'].apply(get_svtype)
    df['start'] = df['pos'].astype(int)
    df['end'] = df['info'].apply(get_end)
    df = df.sort_values(['chrom', 'start'])
    df['GT'] = df['sample'].apply(get_gt)
    df['svlength'] = df['end'] - df['start']

    # Only calls shorter than 50 kbp are kept.
    df = df[df['svlength'] < 50e3]

    # Deletions and duplications only.  isin() always yields a boolean mask,
    # even when the table is empty.
    df = df[df['svtype'].isin(['DEL', 'DUP'])]

    new_entries = []
    for chrom in df['chrom'].unique():
        dfc = df[df['chrom'] == chrom]
        for sv in df['svtype'].unique():
            dfcs = dfc[dfc['svtype'] == sv]
            if dfcs.empty:
                continue

            # Rows are sorted by start, so one left-to-right sweep suffices.
            # (p_start, p_end) is the interval currently being grown.
            intervals = iter(zip(dfcs['start'], dfcs['end']))
            p_start, p_end = next(intervals)
            for start, end in intervals:
                gap = start - p_end  # negative when the two calls overlap
                longest = max(end - start, p_end - p_start)
                if gap < 50e3 and gap < .2 * longest:
                    # Merge into the open interval.  max() keeps the interval
                    # from shrinking when the new call is nested inside it.
                    p_end = max(p_end, end)
                else:
                    # The open interval is finished: emit it exactly once and
                    # start a new one at the current call.
                    new_entries.append([chrom, p_start, p_end, sv])
                    p_start, p_end = start, end
            # Emit the last open interval; without this the final call of
            # every group (and every single-call group) would be lost.
            new_entries.append([chrom, p_start, p_end, sv])

    df_new = pd.DataFrame(new_entries, columns=['chrom', 'start', 'end', 'svtype'])
    df_new['length'] = df_new['end'] - df_new['start']

    df_new.to_csv('output/{}/{}.{}.tsv'.format(caller, sample, caller), sep='\t', index=False)

# manta

In [50]:
caller = 'manta'

# Column layout of the single-sample VCF body; '#' header lines are skipped.
columns = ['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info', 'format', 'sample']
filepath = '/data5/16p12_WGS/structural_variants/vcf_callers/output/manta/{}.manta.vcf'

# For every sample: load the caller's VCF, keep DEL/DUP calls shorter than
# 50 kbp, merge neighbouring calls of the same type, and write one TSV.
for sample in allsamples:
    df = pd.read_csv(filepath.format(sample),
                     sep='\t', comment='#', header=None, names=columns)

    df['svtype'] = df['info'].apply(get_svtype)
    df['start'] = df['pos'].astype(int)
    df['end'] = df['info'].apply(get_end)
    df = df.sort_values(['chrom', 'start'])
    df['GT'] = df['sample'].apply(get_gt)
    df['svlength'] = df['end'] - df['start']

    # Only calls shorter than 50 kbp are kept.
    df = df[df['svlength'] < 50e3]

    # Deletions and duplications only.  isin() always yields a boolean mask,
    # even when the table is empty.
    df = df[df['svtype'].isin(['DEL', 'DUP'])]

    new_entries = []
    for chrom in df['chrom'].unique():
        dfc = df[df['chrom'] == chrom]
        for sv in df['svtype'].unique():
            dfcs = dfc[dfc['svtype'] == sv]
            if dfcs.empty:
                continue

            # Rows are sorted by start, so one left-to-right sweep suffices.
            # (p_start, p_end) is the interval currently being grown.
            intervals = iter(zip(dfcs['start'], dfcs['end']))
            p_start, p_end = next(intervals)
            for start, end in intervals:
                gap = start - p_end  # negative when the two calls overlap
                longest = max(end - start, p_end - p_start)
                if gap < 50e3 and gap < .2 * longest:
                    # Merge into the open interval.  max() keeps the interval
                    # from shrinking when the new call is nested inside it.
                    p_end = max(p_end, end)
                else:
                    # The open interval is finished: emit it exactly once and
                    # start a new one at the current call.
                    new_entries.append([chrom, p_start, p_end, sv])
                    p_start, p_end = start, end
            # Emit the last open interval; without this the final call of
            # every group (and every single-call group) would be lost.
            new_entries.append([chrom, p_start, p_end, sv])

    df_new = pd.DataFrame(new_entries, columns=['chrom', 'start', 'end', 'svtype'])
    df_new['length'] = df_new['end'] - df_new['start']

    df_new.to_csv('output/{}/{}.{}.tsv'.format(caller, sample, caller), sep='\t', index=False)

# delly

In [52]:
caller = 'delly'

# Column layout of the single-sample VCF body; '#' header lines are skipped.
columns = ['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info', 'format', 'sample']
filepath = '/data5/16p12_WGS/structural_variants/vcf_callers/output/delly/{}.delly.vcf'

# For every sample: load the caller's VCF, keep DEL/DUP calls shorter than
# 50 kbp, merge neighbouring calls of the same type, and write one TSV.
for sample in allsamples:
    df = pd.read_csv(filepath.format(sample),
                     sep='\t', comment='#', header=None, names=columns)

    df['svtype'] = df['info'].apply(get_svtype)
    df['start'] = df['pos'].astype(int)
    df['end'] = df['info'].apply(get_end)
    df = df.sort_values(['chrom', 'start'])
    df['GT'] = df['sample'].apply(get_gt)
    df['svlength'] = df['end'] - df['start']

    # Only calls shorter than 50 kbp are kept.
    df = df[df['svlength'] < 50e3]

    # Deletions and duplications only.  isin() always yields a boolean mask,
    # even when the table is empty.
    df = df[df['svtype'].isin(['DEL', 'DUP'])]

    new_entries = []
    for chrom in df['chrom'].unique():
        dfc = df[df['chrom'] == chrom]
        for sv in df['svtype'].unique():
            dfcs = dfc[dfc['svtype'] == sv]
            if dfcs.empty:
                continue

            # Rows are sorted by start, so one left-to-right sweep suffices.
            # (p_start, p_end) is the interval currently being grown.
            intervals = iter(zip(dfcs['start'], dfcs['end']))
            p_start, p_end = next(intervals)
            for start, end in intervals:
                gap = start - p_end  # negative when the two calls overlap
                longest = max(end - start, p_end - p_start)
                if gap < 50e3 and gap < .2 * longest:
                    # Merge into the open interval.  max() keeps the interval
                    # from shrinking when the new call is nested inside it.
                    p_end = max(p_end, end)
                else:
                    # The open interval is finished: emit it exactly once and
                    # start a new one at the current call.
                    new_entries.append([chrom, p_start, p_end, sv])
                    p_start, p_end = start, end
            # Emit the last open interval; without this the final call of
            # every group (and every single-call group) would be lost.
            new_entries.append([chrom, p_start, p_end, sv])

    df_new = pd.DataFrame(new_entries, columns=['chrom', 'start', 'end', 'svtype'])
    df_new['length'] = df_new['end'] - df_new['start']

    df_new.to_csv('output/{}/{}.{}.tsv'.format(caller, sample, caller), sep='\t', index=False)

# lumpy

In [56]:
caller = 'lumpy'

# Column layout of the single-sample VCF body; '#' header lines are skipped.
columns = ['chrom', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info', 'format', 'sample']
filepath = '/data5/16p12_WGS/structural_variants/vcf_callers/output/smoove/{}-smoove.vcf.gz'

# For every sample: load the caller's VCF, keep DEL/DUP calls shorter than
# 50 kbp, merge neighbouring calls of the same type, and write one TSV.
for sample in allsamples:
    # NOTE(review): SG231 is deliberately skipped for this caller; the reason
    # is not recorded here - presumably its smoove output is missing. Confirm.
    if sample == 'SG231':
        continue

    df = pd.read_csv(filepath.format(sample),
                     sep='\t', comment='#', header=None, names=columns)

    df['svtype'] = df['info'].apply(get_svtype)
    df['start'] = df['pos'].astype(int)
    df['end'] = df['info'].apply(get_end)
    df = df.sort_values(['chrom', 'start'])
    df['GT'] = df['sample'].apply(get_gt)
    # Length is derived from END - POS rather than from the SVLEN info field.
    df['svlength'] = df['end'] - df['start']

    # Only calls shorter than 50 kbp are kept.
    df = df[df['svlength'] < 50e3]

    # Deletions and duplications only.  isin() always yields a boolean mask,
    # even when the table is empty.
    df = df[df['svtype'].isin(['DEL', 'DUP'])]

    new_entries = []
    for chrom in df['chrom'].unique():
        dfc = df[df['chrom'] == chrom]
        for sv in df['svtype'].unique():
            dfcs = dfc[dfc['svtype'] == sv]
            if dfcs.empty:
                continue

            # Rows are sorted by start, so one left-to-right sweep suffices.
            # (p_start, p_end) is the interval currently being grown.
            intervals = iter(zip(dfcs['start'], dfcs['end']))
            p_start, p_end = next(intervals)
            for start, end in intervals:
                gap = start - p_end  # negative when the two calls overlap
                longest = max(end - start, p_end - p_start)
                if gap < 50e3 and gap < .2 * longest:
                    # Merge into the open interval.  max() keeps the interval
                    # from shrinking when the new call is nested inside it.
                    p_end = max(p_end, end)
                else:
                    # The open interval is finished: emit it exactly once and
                    # start a new one at the current call.
                    new_entries.append([chrom, p_start, p_end, sv])
                    p_start, p_end = start, end
            # Emit the last open interval; without this the final call of
            # every group (and every single-call group) would be lost.
            new_entries.append([chrom, p_start, p_end, sv])

    df_new = pd.DataFrame(new_entries, columns=['chrom', 'start', 'end', 'svtype'])
    df_new['length'] = df_new['end'] - df_new['start']

    df_new.to_csv('output/{}/{}.{}.tsv'.format(caller, sample, caller), sep='\t', index=False)