Continuing the work from 12/28, where we were developing the algorithm to calculate de novo and inherited using calls from each tool:

# Filtering variants

In [8]:
# for each trio
    # for each CNV in the child
        # if CNV start is not within any parents boundary AND
        # CNV end is not within any parents boundary
            # it's denovo

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
%matplotlib inline

In [2]:
# figure out who is who in each trio
import glob
ped_file = '/data/sudregp/cnv/simplex.ped'
wes_prefix = ['CLIA', 'CCGO', 'WPS']
trios = []
affected = []
controls = []
samples = []
famids = []
fid = open(ped_file, 'r')
for line in fid:
    famid, sid, fa, mo, sex, aff = line.rstrip().split('\t')
    if fa[:4] in wes_prefix and mo[:4] in wes_prefix and sid[:4] in wes_prefix:
        fam = {}
        fam['child'] = sid
        if aff == '1':
            affected.append(sid)
        else:
            controls.append(sid)
        fam['father'] = fa
        fam['mother'] = mo
        fam['famid'] = famid
        trios.append(fam)
        samples += [sid, fa, mo]
        famids.append(famid)
fid.close()
samples = set(samples)
famids = set(famids)

fid = open('/data/sudregp/cnv/kid_samples.txt', 'r')
good_kids = [line.rstrip() for line in fid]
fid.close()

In [3]:
fid = open('/data/sudregp/cnv/conifer/calls.txt', 'r')
cnvs = {}
for sample in samples:
    cnvs[sample] = []
for line in fid:
    if line.find('sampleID') < 0:
        sample, chr, start, end, cnvtype = line.rstrip().split('\t')
        cnvs[sample.split('.')[0]].append([chr, start, end, cnvtype])
fid.close()

In [4]:
inherited = {}
denovo = {}
for trio in trios:
    inherited[trio['child']] = []
    denovo[trio['child']] = []
    
# need to filter in chr and type first because ranges repeat for every chr,
# and inheritance is only true for the same type of CNV
for chr in ['chr%d' % d for d in range(1, 23)]:
    for cnvtype in ['dup', 'del']:
        for trio in trios:
            kid = [cnv for cnv in cnvs[trio['child']] if (cnv[0] == chr and
                                                          cnv[3] == cnvtype)]
            dad = [cnv for cnv in cnvs[trio['father']] if (cnv[0] == chr and
                                                          cnv[3] == cnvtype)]
            mom = [cnv for cnv in cnvs[trio['mother']] if (cnv[0] == chr and
                                                          cnv[3] == cnvtype)]
            for cnv in kid:
                is_inherited = False
                # if CNV start is not within any parents boundary AND
                #         CNV end is not within any parents boundary
                # it's denovo
                for pcnv in dad + mom:
                    if ((cnv[1] >= pcnv[1] or cnv[1] <= pcnv[2]) or
                        (cnv[2] >= pcnv[1] or cnv[2] <= pcnv[2])):
                        is_inherited = True
                if is_inherited:
                    inherited[trio['child']].append(cnv)
                else:
                    denovo[trio['child']].append(cnv)
        
        

In [13]:
for child in affected + controls:
    tmp_cnv = [cnv for cnv in cnvs[child] if cnv[0] in ['chr%d' % d for d in range(1, 23)]]
    print len(inherited[child]), len(denovo[child]), len(tmp_cnv)

31 2 33
46 1 47
48 3 51
33 5 38
41 2 43
28 1 29
23 2 25
41 4 45
31 2 33
22 7 29
23 1 24
38 3 41
27 2 29
55 1 56
33 1 34
35 4 39
24 0 24
5 2 7
32 1 33
37 6 43
28 1 29
25 2 27
64 2 66
28 10 38
39 2 41
51 3 54
28 3 31
34 5 39
18 4 22
46 5 51
33 4 37
17 9 26
11 4 15
13 4 17
33 3 36
45 8 53
55 5 60
32 1 33
62 0 62
30 2 32
64 7 71
52 1 53
32 2 34
31 2 33
32 2 34
1 1 2
45 2 47
50 0 50
44 3 47
40 2 42
20 6 26


OK, this is working. Let's make it into a function so that we can feed in any sort of CNVs:

In [14]:
def classify_cnvs(cnvs, trios):
    inherited = {}
    denovo = {}
    for trio in trios:
        inherited[trio['child']] = []
        denovo[trio['child']] = []

    # need to filter in chr and type first because ranges repeat for every chr,
    # and inheritance is only true for the same type of CNV
    for chr in ['chr%d' % d for d in range(1, 23)]:
        for cnvtype in ['dup', 'del']:
            for trio in trios:
                kid = [cnv for cnv in cnvs[trio['child']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                dad = [cnv for cnv in cnvs[trio['father']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                mom = [cnv for cnv in cnvs[trio['mother']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                for cnv in kid:
                    is_inherited = False
                    # if CNV start is not within any parents boundary AND
                    #         CNV end is not within any parents boundary
                    # it's denovo
                    for pcnv in dad + mom:
                        if ((cnv[1] >= pcnv[1] or cnv[1] <= pcnv[2]) or
                            (cnv[2] >= pcnv[1] or cnv[2] <= pcnv[2])):
                            is_inherited = True
                    if is_inherited:
                        inherited[trio['child']].append(cnv)
                    else:
                        denovo[trio['child']].append(cnv)
    return denovo, inherited

In [15]:
dn, inh = classify_cnvs(cnvs, trios)

Now we can write out the PLINK files for each tool, as if we had run PSEQ:

In [28]:
def write2plink(data, fname, trios):
    fid = open(fname, 'w')
    fid.write('FID\tIID\tCHR\tBP1\tBP2\tTYPE\tSCORE\tSITE\n')
    for sample, cnvs in data.iteritems():
        famid = [trio['famid'] for trio in trios if trio['child'] == sample][0]
        for cnv in cnvs:
            ch = cnv[0].replace('chr', '')
            if cnv[3] == 'del':
                cnvt = '1'
            else:
                cnvt = '3'
            fid.write('\t'.join([famid, sample, ch, cnv[1], cnv[2], cnvt]) +
                      '\t0\t0\n')
    fid.close()

In [29]:
write2plink(dn, '/data/sudregp/cnv/xhmm/test.plink', trios)

OK, everything is working. Now it's just a matter of running PLINK in all 3 tools.

# CONiFer

In [30]:
fid = open('/data/sudregp/cnv/conifer/calls.txt', 'r')
cnvs = {}
for sample in samples:
    cnvs[sample] = []
for line in fid:
    if line.find('sampleID') < 0:
        sample, chr, start, end, cnvtype = line.rstrip().split('\t')
        cnvs[sample.split('.')[0]].append([chr, start, end, cnvtype])
fid.close()

In [35]:
dn, inh = classify_cnvs(cnvs, trios)
write2plink(dn, '/data/sudregp/cnv/conifer/denovo.cnv', trios)
write2plink(inh, '/data/sudregp/cnv/conifer/inherited.cnv', trios)
tmp_cnv = {}
for sample in cnvs.iterkeys():
    if sample in affected + controls:
        tmp_cnv[sample] = cnvs[sample]
write2plink(tmp_cnv, '/data/sudregp/cnv/conifer/all.cnv', trios)

In [40]:
%%bash

module load plink/1.07
cd /data/sudregp/cnv/conifer/
cat ../penncnv/wellknown_region_hg19 > bad_regions.list

for cnvtype in all denovo inherited; do
    cnvname=${cnvtype}.cnv
    plink --cnv-list $cnvname --cnv-make-map --noweb --out ${cnvtype};
    
    # remove bad regions
    plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
        --noweb --1 --cnv-exclude bad_regions.list --cnv-overlap .5 \
        --cnv-write --out ${cnvtype}_clean
    plink --cnv-list ${cnvtype}_clean.cnv --cnv-make-map --noweb --1 \
        --out ${cnvtype}_clean
    
    for qc in '' '_clean'; do
        cnvname=${cnvtype}${qc}.cnv
        # whole burden
        plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-check-no-overlap --out ${cnvtype}${qc}_burden;
        # gene sets
        plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset genes.txt --out ${cnvtype}${qc}_genes;
        plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset hg19_allenBrainGene_trimmed.txt \
            --out ${cnvtype}${qc}_brainGenes;
        # subtypes only
        for sub in del dup; do
            plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-${sub} --out ${cnvtype}${qc}_${sub}Burden;
            # gene sets
            plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset genes.txt --cnv-${sub} \
                --out ${cnvtype}${qc}_${sub}Genes;
            plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset hg19_allenBrainGene_trimmed.txt --cnv-${sub} \
                --out ${cnvtype}${qc}_${sub}BrainGenes;
        done;
    done;
done


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ all.log ]
Analysis started: Tue Jan  2 12:49:48 2018

Options in effect:
	--cnv-list all.cnv
	--cnv-make-map
	--noweb
	--out all


Reading segment list (CNVs) from [ all.cnv ]
Writing new MAP file to [ all.cnv.map ]
Wrote 5264 unique positions to file

Analysis finished: Tue Jan  2 12:49:48 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|---------------------

Now we just do some plotting:

In [41]:
def plot_plink_cnvs(fname, t_str, verbose=False):
    
    import plotly.graph_objs as go
    from plotly import tools

    df = pd.read_table(fname, delimiter='\s+', index_col=1)
    df.head()

    x_red, x_green, red, green, xticks = [], [], [], [], []
    red_text, green_text = [], []

    # loop through families
    f = 0
    for fam in famids:
        fam_kids = [t['child'] for t in trios if t['famid'] == fam]
        found = False
        for kid in fam_kids:
            if kid in good_kids:
                found = True
                idx = [i for i in range(df.shape[0]) if df.index[i].find(kid) >= 0][0]
                if kid in affected:
                    red.append(int(df.iloc[idx]['NSEG']))
                    x_red.append(f)
                    red_text.append(kid)
                else:
                    green.append(int(df.iloc[idx]['NSEG']))
                    x_green.append(f)
                    green_text.append(kid)
        # only increase counter if we added a kid
        if found:
            xticks.append(fam)
            f += 1

    fig = go.Figure()
                
    trace0 = go.Scatter(
        x = x_red,
        y = red,
        mode = 'markers',
        name = 'affected',
        marker = dict(size = 10, color = 'red'),
        text = red_text,
        hoverinfo='text+y',
        showlegend = True
    )
    trace1 = go.Scatter(
        x = x_green,
        y = green,
        mode = 'markers',
        name = 'unaffected',
        marker = dict(size = 10, color = 'green'),
        hovertext = green_text,
        hoverinfo='text+y',
        showlegend = True
    )
    fig['data'] = [trace0, trace1]
    fig['layout'].update(height=400, width=800, title=t_str,
                             xaxis1=dict(tickvals=range(len(xticks)),
                                        ticktext=xticks,
                                        zeroline = False),
                             hovermode='closest')
    iplot(fig)
    
    # print family order
    if verbose:
        score = []
        for x in range(max(x_green + x_red)):
            idx = [i for i, val in enumerate(x_red) if val == x]
            if len(idx) == 0:
                aff = 0
            else:
                aff = red[idx[0]]
            idx = [i for i, val in enumerate(x_green) if val == x]
            if len(idx) == 0:
                unaff = 0
            else:
                unaff = max([green[i] for i in idx])
            score.append(aff - unaff)
        order = np.argsort(score)[::-1]
        fam_names = fig.layout.xaxis1['ticktext']
        disrupted = [fam_names[v] for v in order if score[v] > 0]
        print 'Best disrupted:', ', '.join(disrupted)
        protected = [fam_names[v] for v in order[::-1] if score[v] < 0]
        print 'Best protected:', ', '.join(protected)

In [43]:
import glob

data_dir = '/home/sudregp/data/cnv/conifer/'
files = glob.glob(data_dir + '/denovo*cnv.indiv')
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)

Best disrupted: 10197, 10448, 1895, 10178, 10042, 10173, 10369
Best protected: 10153, 1893, 10182, 10041, 10033, 10164, 10215, 10128, 10131, 10459, 10090, 10406, 10094, 1892


Best disrupted: 10448, 10197, 10178
Best protected: 1893, 10182, 10041, 10131, 10153, 10090, 10033, 10164, 10406, 855, 10459, 10215, 10094


Best disrupted: 1895, 855, 10090, 10197, 10042, 10173, 10369, 10131, 1893
Best protected: 10153, 10033, 10128, 10041, 10215, 10459, 1892, 10182


Best disrupted: 
Best protected: 10041, 10164, 10215


Best disrupted: 
Best protected: 10041, 10164, 10215


Best disrupted: 
Best protected: 


Best disrupted: 10197, 10448, 10094, 10178, 10369
Best protected: 10153, 1893, 10182, 10041, 10033, 10164, 10215, 10128, 10459, 10406, 1892, 10090


Best disrupted: 
Best protected: 10406, 10153


Best disrupted: 10197, 10448, 10178
Best protected: 1893, 10182, 10041, 10164, 10131, 10153, 10033, 10090, 10406, 855, 10459, 10215


Best disrupted: 
Best protected: 10406


Best disrupted: 10131, 10094, 10369, 10090, 855
Best protected: 10153, 10128, 10041, 10182, 10033, 10459, 1892, 10215


Best disrupted: 
Best protected: 10153


Best disrupted: 
Best protected: 10041, 10164, 10215


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 10041, 10164, 10215


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 


In [44]:
import glob

data_dir = '/home/sudregp/data/cnv/conifer/'
files = glob.glob(data_dir + '/all*cnv.indiv')
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)

Best disrupted: 10042, 1893, 855, 10369, 10090, 10033, 10131
Best protected: 10459, 10041, 10182, 10153, 10164, 1892, 10406, 10197, 10215, 10178, 10448, 10173, 10094, 10128, 1895


Best disrupted: 10042, 10128, 10090, 1893, 855, 1892, 10197, 10369
Best protected: 10041, 10182, 10459, 10153, 10164, 10448, 10033, 10173, 1895, 10178, 10094, 10215, 10131


Best disrupted: 10042, 1893, 10033, 855, 10131, 10369, 10448
Best protected: 10459, 10153, 1892, 10182, 10041, 10406, 10197, 10164, 10215, 10128, 10178, 10094, 10090, 10173


Best disrupted: 10094, 10033, 855
Best protected: 10164, 10182, 10197, 10215, 1895, 10153, 10178, 10459, 1892, 10406, 10041, 10128, 10090


Best disrupted: 10033, 10448
Best protected: 10164, 10197, 10215, 10041, 1895, 10459, 1892, 10182


Best disrupted: 10094, 855
Best protected: 10164, 10182, 1895, 10178, 10153, 10448, 10215, 10128, 10459, 1892, 10406, 10090


Best disrupted: 10042, 1893, 10033, 855, 10131, 10369
Best protected: 10459, 10182, 10041, 10153, 10164, 1892, 10406, 10197, 10215, 10128, 10178, 10173, 10448, 1895, 10094


Best disrupted: 10173
Best protected: 10197, 10182, 1892, 10041, 10406, 10094, 10215


Best disrupted: 10042, 1893, 10197, 10128, 10090, 855, 10369, 1892, 10094
Best protected: 10041, 10182, 10459, 10164, 10448, 10153, 10178, 10033, 10215, 10173, 1895, 10406, 10131


Best disrupted: 10153
Best protected: 10197


Best disrupted: 10042, 10033, 1893, 10131, 855, 10448, 10369
Best protected: 10459, 10182, 1892, 10153, 10041, 10197, 10406, 10164, 10128, 10215, 10090, 10178, 10173, 10094, 1895


Best disrupted: 10173
Best protected: 10197, 10182, 1892, 10041, 10406, 10153, 10094, 10215


Best disrupted: 10094, 10033, 855
Best protected: 10164, 10182, 10197, 10215, 1895, 10153, 10178, 10459, 1892, 10406, 10041, 10128, 10090


Best disrupted: 10153
Best protected: 10197, 10406


Best disrupted: 10033, 10448
Best protected: 10164, 10197, 10215, 10041, 1895, 10459, 1892, 10182


Best disrupted: 10153
Best protected: 10197, 10406


Best disrupted: 10094, 855
Best protected: 10164, 10182, 1895, 10178, 10153, 10448, 10215, 10128, 10459, 1892, 10406, 10090


Best disrupted: 
Best protected: 


# ExomeCopy

# cn.mops

# TODO

* gene set analysis