In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)


In [2]:
# Load the phenotype table and keep a single row per subject.
pheno = (
    pd.read_csv('../16p12.2_rnaseq_analysis/data/pheno_final.tsv', sep='\t')
    .drop_duplicates('subject')
)

# Distinct subject identifiers, in order of first appearance in the table.
samples = list(pheno['subject'].unique())

In [3]:
# Read the sample codes, one per line. Blank lines are skipped so that a
# trailing newline (or an accidental empty line) in the list file does not
# produce an empty sample code, which would later be formatted into
# per-sample file paths that do not exist.
with open('all_codes.list', 'r') as f:
    allsamples = [line.strip() for line in f if line.strip()]

# PennCNV: combine 16p and second hit variants

In [4]:
# Load the two PennCNV call tables and report their (rows, columns) shapes.
df_16p = pd.read_csv('PennCNV_16p.csv')
print(df_16p.shape)

df_rest = pd.read_csv('PennCNV_rare_second_hit.csv')
print(df_rest.shape)

(264, 20)
(660, 20)


In [5]:
# Stack the two PennCNV tables into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([df_16p, df_rest], ignore_index=True)

In [6]:
# Persist the combined PennCNV calls as a tab-separated file (no index column).
df.to_csv(
    'output/penncnv/PennCNV_full.tsv',
    sep='\t',
    index=False,
)

# CNVnator: keep calls > 50 kbp and merge nearby calls

In [7]:
def get_end(s):
    """Extract the END coordinate from a semicolon-separated INFO string.

    The string is split on ';' and the last field starting with 'END=' is
    used; when no such field exists the result is 0. A single leading '-'
    on the value is dropped before conversion, so 'END=-500' yields 500.

    Parameters
    ----------
    s : str
        INFO string such as 'END=12345;SVTYPE=DEL'.

    Returns
    -------
    int
        The parsed END value, or 0 when the string has no END field.

    Raises
    ------
    ValueError
        If the text after 'END=' is not an integer literal.
    """
    end_fields = [field for field in s.split(';') if field.startswith('END=')]
    # Only the text between the first and second '=' of the field is used.
    raw_value = end_fields[-1].split('=')[1] if end_fields else '0'
    if raw_value[:1] == '-':
        raw_value = raw_value[1:]
    return int(raw_value)

In [8]:
caller = 'cnvnator_large'
columns = ['chrom','pos','id','ref','alt','qual','filter','info','format','sample']

# NOTE(review): absolute, machine-specific input location; consider moving it
# to a configurable data directory.
filepath = '/data5/16p12_WGS/structural_variants/vcf_callers/output/cnvnator/bin200/cnv2vcf.{}.cnvnator.vcf'


def merge_nearby_calls(calls, max_gap=50e3, max_gap_frac=.2):
    """Merge neighbouring calls of the same type on the same chromosome.

    Within each (chrom, svtype) group the calls are walked in order of
    ``start``. A call is folded into the current run when the gap between
    the run's end and the call's start is below ``max_gap`` and also below
    ``max_gap_frac`` times the larger of the two lengths. Each run is
    emitted exactly once, including the last (or only) run of a group.

    Parameters
    ----------
    calls : pandas.DataFrame
        Must provide 'chrom', 'start', 'end', 'svtype' and 'svlength'.
    max_gap : float
        Absolute upper bound on the gap between two calls to be merged.
    max_gap_frac : float
        Upper bound on the gap as a fraction of the larger call length.

    Returns
    -------
    pandas.DataFrame
        Columns 'chrom', 'start', 'end', 'svtype'; one row per merged run.
    """
    merged = []
    svtypes = list(calls.svtype.unique())
    for chrom in calls.chrom.unique():
        chrom_calls = calls[calls.chrom == chrom]
        for svtype in svtypes:
            # Sort explicitly: the gap computation below is only meaningful
            # when calls are visited in increasing start order.
            group = chrom_calls[chrom_calls.svtype == svtype].sort_values('start')
            if group.shape[0] == 0:
                continue

            run_start = run_end = run_length = None
            for start, end, length in zip(group['start'], group['end'], group['svlength']):
                if run_start is None:
                    # First call of the group opens the first run.
                    run_start, run_end, run_length = start, end, length
                    continue

                gap = start - run_end
                if gap < max_gap and gap < max_gap_frac * max(length, run_length):
                    # Extend the current run; max() keeps the run from
                    # shrinking if a call ends before the run does.
                    run_end = max(run_end, end)
                    run_length = run_end - run_start
                else:
                    # Close the current run and open a new one at this call.
                    merged.append([chrom, run_start, run_end, svtype])
                    run_start, run_end, run_length = start, end, length

            # Flush the final run; without this the last (or only) call of
            # every group would be lost.
            merged.append([chrom, run_start, run_end, svtype])

    return pd.DataFrame(merged, columns=['chrom', 'start', 'end', 'svtype'])


for sample in allsamples:
    df = pd.read_csv(filepath.format(sample),
                     sep='\t', comment='#', header=None, names=columns)

    # Derive interval columns from the VCF fields.
    df['start'] = df['pos']
    df['end'] = df['info'].apply(get_end)
    df['svlength'] = df['end'] - df['start']
    df['svtype'] = df.alt.apply(lambda s: s.strip('<').strip('>'))

    # Keep only calls longer than 50 kbp before merging.
    df = df[df.svlength > 50e3]

    df_new = merge_nearby_calls(df)
    df_new['length'] = df_new.end - df_new.start

    df_new.to_csv('output/{}/{}.{}.tsv'.format(caller, sample, caller), sep='\t', index=False)

# CNVnator: get Intra-cohort Frequency

In [9]:
cols = ['chrom', 'start', 'end', 'svtype', 'svlength']

# Read every per-sample merged call table, tag it with its subject, and
# concatenate once at the end. This replaces repeated DataFrame.append calls,
# which were removed in pandas 2.0 and copy the accumulated frame each time.
frames = []
for subject in allsamples:
    dfa = pd.read_csv('output/cnvnator_large/{}.cnvnator_large.tsv'.format(subject),
                      sep='\t')
    # The files carry a 'length' column; rename positionally to 'svlength'.
    dfa.columns = cols
    dfa['subject'] = subject
    frames.append(dfa)

if frames:
    dfall = pd.concat(frames, ignore_index=True)
else:
    # No samples: keep an empty frame with the expected columns.
    dfall = pd.DataFrame(columns=cols + ['subject'])

In [10]:
# For every call of every subject, count how many *other* subjects carry a
# call of the same type on the same chromosome whose overlap with it exceeds
# half of the larger of the two lengths, then write one table per subject.
for subject in allsamples:
    df = dfall[dfall.subject == subject].copy()
    rdf = dfall[dfall.subject != subject]

    df['intra_cohort_count'] = 0

    for chrom in df.chrom.unique():
        for svtype in ['DUP', 'DEL']:
            dfc = df[(df.chrom == chrom) & (df.svtype == svtype)]
            rdfc = rdf[(rdf.chrom == chrom) & (rdf.svtype == svtype)]
            if dfc.empty or rdfc.empty:
                # Nothing to compare; counts stay at their initial 0.
                continue

            # Hoisted out of the per-call loop. to_numeric guards against
            # object-dtype columns so the vectorized clip below is safe.
            other_start = pd.to_numeric(rdfc['start'])
            other_end = pd.to_numeric(rdfc['end'])
            other_length = pd.to_numeric(rdfc['svlength'])

            for i in dfc.index:
                start = dfc.at[i, 'start']
                end = dfc.at[i, 'end']
                length = dfc.at[i, 'svlength']

                # clip(upper=end) == min(x, end); clip(lower=start) == max(x, start)
                overlap = other_end.clip(upper=end) - other_start.clip(lower=start)
                max_length = other_length.clip(lower=length)
                odf = rdfc[overlap > .5 * max_length]

                df.at[i, 'intra_cohort_count'] = len(odf.subject.unique())

    # NOTE(review): 345 is a hard-coded denominator, presumably the cohort
    # size; confirm it matches the number of samples actually processed.
    df['intra_cohort_freq'] = df['intra_cohort_count'] / 345.
    df.to_csv('output/cnvnator_large/{}.intra_cohort.tsv'.format(subject), sep='\t', index=False)

# CNVnator: keep calls with intra-cohort count < 10 and combine

In [11]:
# Peek at the column names of one per-sample intra-cohort table. The last
# sample is named explicitly instead of relying on a loop variable left over
# from an earlier cell.
pd.read_csv('output/cnvnator_large/{}.intra_cohort.tsv'.format(allsamples[-1]), sep='\t').columns

In [12]:
cols = ['chrom', 'start', 'end', 'svtype', 'svlength', 'subject',
       'intra_cohort_count', 'intra_cohort_freq']

# Read every per-sample intra-cohort table and concatenate once at the end.
# This replaces repeated DataFrame.append calls, which were removed in
# pandas 2.0 and copy the accumulated frame on every iteration.
frames = []
for subject in allsamples:
    dfa = pd.read_csv('output/cnvnator_large/{}.intra_cohort.tsv'.format(subject), sep='\t')
    dfa.columns = cols
    dfa['subject'] = subject
    frames.append(dfa)

if frames:
    dfall = pd.concat(frames, ignore_index=True)
else:
    # No samples: keep an empty frame with the expected columns.
    dfall = pd.DataFrame(columns=cols)

In [13]:
# Retain only the rows whose intra_cohort_count is below 10.
dfall = dfall.loc[dfall['intra_cohort_count'] < 10]

In [14]:
# Save the filtered CNVnator calls as a tab-separated file (no index column).
dfall.to_csv(
    'output/cnvnator_large/cnvnator.filtered_intra.tsv',
    sep='\t',
    index=False,
)

# Combine CNVnator and PennCNV calls

In [15]:
# Reload both call sets from disk and bring them to one shared layout:
# chrom, start, end, svtype, subject, length, caller.
nator = pd.read_csv('output/cnvnator_large/cnvnator.filtered_intra.tsv', sep='\t')
penn = pd.read_csv('output/penncnv/PennCNV_full.tsv', sep='\t')

# CNVnator: select the shared columns, rename 'svlength', tag the caller.
nator = (
    nator[['chrom', 'start', 'end', 'svtype', 'subject', 'svlength']]
    .rename(columns={'svlength': 'length'})
    .assign(caller='cnvnator')
)

# PennCNV: map its column names onto the shared ones, derive the length from
# the coordinates, tag the caller, and upper-case the CNV type labels.
# NOTE(review): the 'DUP'/'DEL' comparisons used later assume the upper-cased
# labels and the chromosome naming match CNVnator's; confirm against the data.
penn = (
    penn[['Chromosome', 'Start', 'End', 'CNV_type', 'Patient_ID']]
    .rename(columns={
        'Chromosome': 'chrom',
        'Start': 'start',
        'End': 'end',
        'CNV_type': 'svtype',
        'Patient_ID': 'subject',
    })
    .assign(
        length=lambda t: t['end'] - t['start'],
        caller='pennCNV',
        svtype=lambda t: t['svtype'].map(lambda label: label.upper()),
    )
)

In [16]:
# Merge CNVnator and PennCNV calls per subject, chromosome and CNV type.
# Two calls are grouped when their overlap exceeds half of the larger of the
# two lengths. Each emitted row spans the union of its group and records which
# callers contributed.
merged_rows = []

for subject in allsamples:
    snator = nator[nator.subject == subject]
    spenn = penn[penn.subject == subject]

    # First-appearance order keeps the iteration deterministic between runs
    # (a set of strings has no stable order).
    chromosomes = pd.concat([snator.chrom, spenn.chrom]).unique()

    for chrom in chromosomes:
        for svtype in ['DUP', 'DEL']:
            snatord = snator[(snator.svtype == svtype) & (snator.chrom == chrom)]
            spennd = spenn[(spenn.svtype == svtype) & (spenn.chrom == chrom)]

            dfd = pd.concat([snatord, spennd])
            if dfd.shape[0] == 0:
                continue

            dfd = dfd.sort_values(['start', 'caller']).reset_index(drop=True)

            # Row labels already absorbed into an emitted group.
            done = set()

            for i in dfd.index:
                if i in done:
                    continue

                start = dfd.at[i, 'start']
                end = dfd.at[i, 'end']
                length = dfd.at[i, 'length']
                # Local name chosen so the notebook-level `caller` variable
                # is not overwritten by this loop.
                row_caller = dfd.at[i, 'caller']

                # clip(upper=end) == min(x, end); clip(lower=start) == max(x, start)
                overlap_len = dfd['end'].clip(upper=end) - dfd['start'].clip(lower=start)
                overlap = overlap_len > .5 * dfd['length'].clip(lower=length)

                odf = dfd[overlap]
                callers = odf.caller.unique()
                merged_rows.append([chrom, odf['start'].min(), odf['end'].max(),
                                    subject, svtype, ','.join(callers), len(callers)])

                if (odf.caller != row_caller).any():
                    # Supported by another caller: consume the whole group.
                    done.update(odf.index)
                else:
                    # NOTE(review): only row i is consumed here, although odf
                    # may hold other overlapping rows from the same caller;
                    # those rows are visited again and can produce
                    # near-duplicate records. Confirm this is intended.
                    done.add(i)

# Passing the column names to the constructor also works when no rows were
# collected; assigning .columns afterwards would raise on an empty frame.
keep = pd.DataFrame(merged_rows,
                    columns=['chrom', 'start', 'end', 'subject', 'svtype', 'caller', 'num_callers'])
keep['length'] = keep['end'] - keep['start']
keep = keep.sort_values(['chrom', 'start'])

In [17]:
# Save the merged call set as a tab-separated file (no index column).
keep.to_csv(
    'output/merged_greater_50/merged.tsv',
    sep='\t',
    index=False,
)

# gene annotation

In [18]:
# Reload the merged call set from disk for annotation.
df = pd.read_csv(
    'output/merged_greater_50/merged.tsv',
    sep='\t',
)

In [19]:
# Positional columns of the header-less gene table that are kept, with the
# names used by the annotation step.
# NOTE(review): presumably these positions are chrom / txStart / txEnd / name2
# of the UCSC refGene schema -- confirm against the file.
refgene_columns = {2: 'chrom', 4: 'start', 5: 'end', 12: 'gene'}

gdf = pd.read_csv('refGene_hg19.txt', sep='\t', header=None)
gdf = gdf[list(refgene_columns)].rename(columns=refgene_columns)

In [20]:
# Annotate every merged call with the genes it overlaps by at least one base
# on the same chromosome. Calls without any overlapping gene keep ''.
# NOTE(review): matching relies on df.chrom and gdf.chrom using the same
# chromosome naming; confirm for both callers.
df['genes'] = ''

for chrom in df.chrom.unique():
    dfc = df[df.chrom == chrom]
    gdfc = gdf[gdf.chrom == chrom]
    if gdfc.empty:
        # No gene records for this chromosome; leave the default ''.
        continue

    gene_start = gdfc['start']
    gene_end = gdfc['end']

    for i in dfc.index:
        start = dfc.at[i, 'start']
        end = dfc.at[i, 'end']

        # clip(upper=end) == min(x, end); clip(lower=start) == max(x, start)
        overlap_len = gene_end.clip(upper=end) - gene_start.clip(lower=start)
        odf = gdfc[overlap_len > 0]

        if odf.shape[0] > 0:
            # The gene table may list one symbol on several rows; keep each
            # symbol once, in order of first appearance, so the joined string
            # does not repeat names.
            df.at[i, 'genes'] = ';'.join(odf.gene.drop_duplicates().tolist())

In [21]:
# Write the annotated calls back to the same path they were loaded from,
# replacing the un-annotated file.
df.to_csv(
    'output/merged_greater_50/merged.tsv',
    sep='\t',
    index=False,
)