Look at the notebook from 01/02 for th results using Conifer. HEre are the others that were still running:

# Filtering variants

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
%matplotlib inline

In [3]:
# figure out who is who in each trio
import glob
ped_file = '/data/sudregp/cnv/simplex.ped'
wes_prefix = ['CLIA', 'CCGO', 'WPS']
trios = []
affected = []
controls = []
samples = []
famids = []
fid = open(ped_file, 'r')
for line in fid:
    famid, sid, fa, mo, sex, aff = line.rstrip().split('\t')
    if fa[:4] in wes_prefix and mo[:4] in wes_prefix and sid[:4] in wes_prefix:
        fam = {}
        fam['child'] = sid
        if aff == '1':
            affected.append(sid)
        else:
            controls.append(sid)
        fam['father'] = fa
        fam['mother'] = mo
        fam['famid'] = famid
        trios.append(fam)
        samples += [sid, fa, mo]
        famids.append(famid)
fid.close()
samples = set(samples)
famids = set(famids)

fid = open('/data/sudregp/cnv/kid_samples.txt', 'r')
good_kids = [line.rstrip() for line in fid]
fid.close()

In [4]:
def classify_cnvs(cnvs, trios):
    inherited = {}
    denovo = {}
    for trio in trios:
        inherited[trio['child']] = []
        denovo[trio['child']] = []

    # need to filter in chr and type first because ranges repeat for every chr,
    # and inheritance is only true for the same type of CNV
    for chr in ['chr%d' % d for d in range(1, 23)]:
        for cnvtype in ['dup', 'del']:
            for trio in trios:
                kid = [cnv for cnv in cnvs[trio['child']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                dad = [cnv for cnv in cnvs[trio['father']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                mom = [cnv for cnv in cnvs[trio['mother']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                for cnv in kid:
                    is_inherited = False
                    # if CNV start is not within any parents boundary AND
                    #         CNV end is not within any parents boundary
                    # it's denovo
                    for pcnv in dad + mom:
                        if ((cnv[1] >= pcnv[1] or cnv[1] <= pcnv[2]) or
                            (cnv[2] >= pcnv[1] or cnv[2] <= pcnv[2])):
                            is_inherited = True
                    if is_inherited:
                        inherited[trio['child']].append(cnv)
                    else:
                        denovo[trio['child']].append(cnv)
    return denovo, inherited

In [5]:
def write2plink(data, fname, trios):
    fid = open(fname, 'w')
    fid.write('FID\tIID\tCHR\tBP1\tBP2\tTYPE\tSCORE\tSITE\n')
    for sample, cnvs in data.iteritems():
        famid = [trio['famid'] for trio in trios if trio['child'] == sample][0]
        for cnv in cnvs:
            ch = cnv[0].replace('chr', '')
            if cnv[3] == 'del':
                cnvt = '1'
            else:
                cnvt = '3'
            if len(cnv) > 4:
                score = cnv[5]
                site = cnv[4]
            else:
                score, site = ['0', '0']
            fid.write('\t'.join([famid, sample, ch, cnv[1], cnv[2], cnvt,
                                 score, site]) + '\n')
    fid.close()

In [6]:
def plot_plink_cnvs(fname, t_str, verbose=False):
    
    import plotly.graph_objs as go
    from plotly import tools

    df = pd.read_table(fname, delimiter='\s+', index_col=1)
    df.head()

    x_red, x_green, red, green, xticks = [], [], [], [], []
    red_text, green_text = [], []

    # loop through families
    f = 0
    for fam in famids:
        fam_kids = [t['child'] for t in trios if t['famid'] == fam]
        found = False
        for kid in fam_kids:
            if kid in good_kids:
                found = True
                idx = [i for i in range(df.shape[0]) if df.index[i].find(kid) >= 0][0]
                if kid in affected:
                    red.append(int(df.iloc[idx]['NSEG']))
                    x_red.append(f)
                    red_text.append(kid)
                else:
                    green.append(int(df.iloc[idx]['NSEG']))
                    x_green.append(f)
                    green_text.append(kid)
        # only increase counter if we added a kid
        if found:
            xticks.append(fam)
            f += 1

    fig = go.Figure()
                
    trace0 = go.Scatter(
        x = x_red,
        y = red,
        mode = 'markers',
        name = 'affected',
        marker = dict(size = 10, color = 'red'),
        text = red_text,
        hoverinfo='text+y',
        showlegend = True
    )
    trace1 = go.Scatter(
        x = x_green,
        y = green,
        mode = 'markers',
        name = 'unaffected',
        marker = dict(size = 10, color = 'green'),
        hovertext = green_text,
        hoverinfo='text+y',
        showlegend = True
    )
    fig['data'] = [trace0, trace1]
    fig['layout'].update(height=400, width=800, title=t_str,
                             xaxis1=dict(tickvals=range(len(xticks)),
                                        ticktext=xticks,
                                        zeroline = False),
                             hovermode='closest')
    iplot(fig)
    
    # print family order
    if verbose:
        score = []
        for x in range(max(x_green + x_red)):
            idx = [i for i, val in enumerate(x_red) if val == x]
            if len(idx) == 0:
                aff = 0
            else:
                aff = red[idx[0]]
            idx = [i for i, val in enumerate(x_green) if val == x]
            if len(idx) == 0:
                unaff = 0
            else:
                unaff = max([green[i] for i in idx])
            score.append(aff - unaff)
        order = np.argsort(score)[::-1]
        fam_names = fig.layout.xaxis1['ticktext']
        disrupted = [fam_names[v] for v in order if score[v] > 0]
        print 'Best disrupted:', ', '.join(disrupted)
        protected = [fam_names[v] for v in order[::-1] if score[v] < 0]
        print 'Best protected:', ', '.join(protected)

# ExomeCopy

In [7]:
fid = open('/data/sudregp/cnv/exomeCopy/calls.txt', 'r')
cnvs = {}
for sample in samples:
    cnvs[sample] = []
for line in fid:
    if line.find('seqnames') < 0:
        clean_line = line.replace('"', '').rstrip()
        chr, start, end, site, t1, cnvtype, score, t2, t3, sample = clean_line.split(',')
        if int(cnvtype) > 2:
            cnvtype = 'dup'
        else:
            cnvtype = 'del'
        cnvs[sample].append([chr, start, end, cnvtype, score, site])
fid.close()

In [8]:
tool = 'exomeCopy'
dn, inh = classify_cnvs(cnvs, trios)

# exomeCopy generated too many inherited CNVs. Let's play only with denovos for now.
write2plink(dn, '/data/sudregp/cnv/%s/denovo.cnv' % tool, trios)
write2plink(inh, '/data/sudregp/cnv/%s/inherited.cnv' % tool, trios)
tmp_cnv = {}
for sample in cnvs.iterkeys():
    if sample in affected + controls:
        tmp_cnv[sample] = cnvs[sample]
write2plink(tmp_cnv, '/data/sudregp/cnv/%s/all.cnv' % tool, trios)

In [9]:
%%bash

module load plink/1.07
tool=exomeCopy
cd /data/sudregp/cnv/${tool}/
cat ../penncnv/wellknown_region_hg19 > bad_regions.list
cp ../penncnv/genes.txt ../penncnv/hg19_allenBrainGene_trimmed.txt ../penncnv/glist-hg19 .

# exomeCopy generated too many inherited CNVs. Let's play only with denovos for now.
for cnvtype in denovo inherited all; do
    cnvname=${cnvtype}.cnv
    plink --cnv-list $cnvname --cnv-make-map --noweb --out ${cnvtype};
    
    # remove bad regions
    plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
        --noweb --1 --cnv-exclude bad_regions.list --cnv-overlap .5 \
        --cnv-write --out ${cnvtype}_clean
    plink --cnv-list ${cnvtype}_clean.cnv --cnv-make-map --noweb --1 \
        --out ${cnvtype}_clean
    
    for qc in '' '_clean'; do
        cnvname=${cnvtype}${qc}.cnv
        # whole burden
        plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-check-no-overlap --out ${cnvtype}${qc}_burden;
        # gene sets
        plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset genes.txt --out ${cnvtype}${qc}_genes;
        plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset hg19_allenBrainGene_trimmed.txt \
            --out ${cnvtype}${qc}_brainGenes;
        # subtypes only
        for sub in del dup; do
            plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-${sub} --out ${cnvtype}${qc}_${sub}Burden;
            # gene sets
            plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset genes.txt --cnv-${sub} \
                --out ${cnvtype}${qc}_${sub}Genes;
            plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset hg19_allenBrainGene_trimmed.txt --cnv-${sub} \
                --out ${cnvtype}${qc}_${sub}BrainGenes;
        done;
    done;
done

Process is interrupted.


Now we just do some plotting:

In [29]:
import glob

tool = 'exomeCopy'
data_dir = '/home/sudregp/data/cnv/%s/' % tool
files = glob.glob(data_dir + '/denovo*cnv.indiv')
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)

Best disrupted: 10197, 10042, 855, 10406
Best protected: 10459, 10164, 1893, 10448, 10131, 1892


Best disrupted: 10042, 855, 10406
Best protected: 1893, 10448, 10131, 1892


Best disrupted: 10197, 855
Best protected: 10459, 10164, 10406


Best disrupted: 
Best protected: 10164, 10459, 1893


Best disrupted: 
Best protected: 1893


Best disrupted: 
Best protected: 10164, 10459


Best disrupted: 10197, 855, 10042, 10406
Best protected: 10459, 10164, 1893, 10448


Best disrupted: 
Best protected: 1893


Best disrupted: 855, 10042, 10406
Best protected: 1893, 10448


Best disrupted: 
Best protected: 1893


Best disrupted: 10197, 855
Best protected: 10459, 10164


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 10164, 10459, 1893


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 1893


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 10164, 10459


Best disrupted: 
Best protected: 


# cn.mops

In [30]:
fid = open('/data/sudregp/cnv/cn.mops/cnvs.csv', 'r')
cnvs = {}
for sample in samples:
    cnvs[sample] = []
for line in fid:
    if line.find('seqnames') < 0:
        clean_line = line.replace('"', '').rstrip()
        chr, start, end, site, t1, sample, t2, t3, cnvtype = clean_line.split(',')
        if cnvtype.find('0') or cnvtype.find('1'):
            cnvtype = 'dup'
        else:
            cnvtype = 'del'
        cnvs[sample.split('.')[0]].append([chr, start, end, cnvtype, '0', site])
fid.close()

In [31]:
tool = 'cn.mops'
dn, inh = classify_cnvs(cnvs, trios)
write2plink(dn, '/data/sudregp/cnv/%s/denovo.cnv' % tool, trios)
write2plink(inh, '/data/sudregp/cnv/%s/inherited.cnv' % tool, trios)
tmp_cnv = {}
for sample in cnvs.iterkeys():
    if sample in affected + controls:
        tmp_cnv[sample] = cnvs[sample]
write2plink(tmp_cnv, '/data/sudregp/cnv/%s/all.cnv' % tool, trios)

In [32]:
%%bash

module load plink/1.07
tool=cn.mops
cd /data/sudregp/cnv/${tool}/
cat ../penncnv/wellknown_region_hg19 > bad_regions.list
cp ../penncnv/genes.txt ../penncnv/hg19_allenBrainGene_trimmed.txt ../penncnv/glist-hg19 .

# exomeCopy generated too many inherited CNVs. Let's play only with denovos for now.
for cnvtype in denovo inherited all; do
    cnvname=${cnvtype}.cnv
    plink --cnv-list $cnvname --cnv-make-map --noweb --out ${cnvtype};
    
    # remove bad regions
    plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
        --noweb --1 --cnv-exclude bad_regions.list --cnv-overlap .5 \
        --cnv-write --out ${cnvtype}_clean
    plink --cnv-list ${cnvtype}_clean.cnv --cnv-make-map --noweb --1 \
        --out ${cnvtype}_clean
    
    for qc in '' '_clean'; do
        cnvname=${cnvtype}${qc}.cnv
        # whole burden
        plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-check-no-overlap --out ${cnvtype}${qc}_burden;
        # gene sets
        plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset genes.txt --out ${cnvtype}${qc}_genes;
        plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset hg19_allenBrainGene_trimmed.txt \
            --out ${cnvtype}${qc}_brainGenes;
        # subtypes only
        for sub in del dup; do
            plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-${sub} --out ${cnvtype}${qc}_${sub}Burden;
            # gene sets
            plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset genes.txt --cnv-${sub} \
                --out ${cnvtype}${qc}_${sub}Genes;
            plink --map ${cnvname}.map --fam ../simplex.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset hg19_allenBrainGene_trimmed.txt --cnv-${sub} \
                --out ${cnvtype}${qc}_${sub}BrainGenes;
        done;
    done;
done


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ denovo.log ]
Analysis started: Wed Jan  3 15:30:30 2018

Options in effect:
	--cnv-list denovo.cnv
	--cnv-make-map
	--noweb
	--out denovo


Reading segment list (CNVs) from [ denovo.cnv ]
Writing new MAP file to [ denovo.cnv.map ]
Wrote 60 unique positions to file

Analysis finished: Wed Jan  3 15:30:30 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|--------

In [33]:
import glob

tool = 'cn.mops'
data_dir = '/home/sudregp/data/cnv/%s/' % tool
files = glob.glob(data_dir + '/denovo*cnv.indiv')
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)

Best disrupted: 10178, 10094, 10197, 10164, 10406, 10090
Best protected: 10033, 10153, 10215, 10131


Best disrupted: 10178, 10094, 10164, 10406, 10090
Best protected: 10033, 10153, 10131


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 


Best disrupted: 10178, 10094, 10197, 10164, 10406, 10090
Best protected: 10033, 10153, 10215, 10131


Best disrupted: 10178, 10094, 10164, 10406, 10090
Best protected: 10033, 10153, 10131


Best disrupted: 
Best protected: 


In [34]:
import glob

tool = 'cn.mops'
data_dir = '/home/sudregp/data/cnv/%s/' % tool
files = glob.glob(data_dir + '/all*cnv.indiv')
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)

Best disrupted: 10042, 855, 1892, 10178, 10033, 10094, 10406, 10182, 10153, 10448
Best protected: 10459, 10041, 10369, 10131, 10164, 1893, 10215, 10128, 1895, 10197, 10090


Best disrupted: 10042, 10178, 10033, 855, 1892, 10406, 10448, 10094, 10197
Best protected: 10459, 10369, 10041, 10131, 10164, 10215, 1893, 10128, 1895, 10173, 10090, 10153


Best disrupted: 
Best protected: 10369


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 


Best disrupted: 10042, 855, 1892, 10178, 10033, 10094, 10406, 10182, 10153, 10448
Best protected: 10459, 10041, 10369, 10131, 10164, 1893, 10215, 10128, 1895, 10197, 10090


Best disrupted: 10042, 10178, 10033, 855, 1892, 10406, 10448, 10094, 10197
Best protected: 10459, 10369, 10041, 10131, 10164, 10215, 1893, 10128, 1895, 10173, 10090, 10153


Best disrupted: 
Best protected: 10369


Best disrupted: 855, 10033, 10042, 1892
Best protected: 10369, 10041, 10131, 10164, 10153, 1893, 10178


Best disrupted: 855, 10033, 10042, 1892
Best protected: 10369, 10041, 10131, 10164, 10153, 1893, 10178


Best disrupted: 
Best protected: 10369


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 


Best disrupted: 855, 10033, 10042, 1892
Best protected: 10369, 10041, 10131, 10164, 10153, 1893, 10178


Best disrupted: 855, 10033, 10042, 1892
Best protected: 10369, 10041, 10131, 10164, 10153, 1893, 10178


Best disrupted: 
Best protected: 10369


# Summary

Nothing big pops up on this first look. Maybe we if start tweaking the parameters something might come up. Hard to tell. Gene-based analysis might also work better, but in that case I need to pick a tool first. Let's go with XHMM then, as it seems to be the most well-established one.