Let's go ahead and make a few plots now. I'm thinking one for denovo, one for transmitted, and one for overall, plotting each family in the X axis. Red marker is affected, greens are unaffected.

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
%matplotlib inline

In [13]:
# figure out who is who in each trio
import glob
data_dir = '/data/sudregp/cnv/penncnv/'
ped_file = '/data/sudregp/cnv/simplex.ped'
wes_prefix = ['CLIA', 'CCGO', 'WPS']
trios = []
affected = []
controls = []
samples = []
famids = []
fid = open(ped_file, 'r')
for line in fid:
    famid, sid, fa, mo, sex, aff = line.rstrip().split('\t')
    if fa[:4] in wes_prefix and mo[:4] in wes_prefix and sid[:4] in wes_prefix:
        fam = {}
        fam['child'] = sid
        if aff == '1':
            affected.append(sid)
        else:
            controls.append(sid)
        fam['father'] = fa
        fam['mother'] = mo
        fam['famid'] = famid
        trios.append(fam)
        samples += [sid, fa, mo]
        famids.append(famid)
fid.close()
samples = set(samples)
famids = set(famids)

And of course we'll make those plots for the different subsets of CNVs I've been analyzing. Let's start with the entire set of called CVNs, just so we can get the plotting function working.

In [40]:
%%bash
cd /data/sudregp/cnv/penncnv/results

echo "kid,denovo,inherited" > summary_called.csv;
while read sample; do
    triocnv=${sample}'.jointcnv'
    rm denovo.txt inherited.txt 2>/dev/null
    grep mother ${triocnv} > mom_snps;
    grep father ${triocnv} > dad_snps;
    cat mom_snps dad_snps > parent_snps;
    for snp in `grep offspring ${triocnv} | cut -d' ' -f 1`; do
        if ! grep -q $snp parent_snps; then
            echo "$snp" >> denovo.txt
        else
            echo "$snp" >> inherited.txt
        fi;
   done
   echo ${sample},`cat denovo.txt 2>/dev/null | wc -l`,`cat inherited.txt 2>/dev/null | wc -l` >> summary_called.csv;
   rm *_snps;
done < ../good_kids_numCNVse20.txt

In [98]:
import plotly.graph_objs as go
from plotly import tools

df = pd.read_csv('/data/sudregp/cnv/penncnv/results/summary_called.csv')
df['total'] = df.denovo + df.inherited

fig = tools.make_subplots(rows=3, cols=1, subplot_titles=('De Novo CNVs',
                                                          'Inherited CNVs',
                                                          'All CNVs'))

for cnv in ['denovo', 'inherited', 'total']:
    x_red, x_green, red, green, xticks = [], [], [], [], []
    red_text, green_text = [], []

    # loop through families
    f = 0
    for fam in famids:
        fam_kids = [t['child'] for t in trios if t['famid'] == fam]
        found = False
        for kid in fam_kids:
            if kid in list(df.kid):
                found = True
                if kid in affected:
                    red.append(int(df[df.kid == kid][cnv]))
                    x_red.append(f)
                    red_text.append(kid)
                else:
                    green.append(int(df[df.kid == kid][cnv]))
                    x_green.append(f)
                    green_text.append(kid)
        # only increase counter if we added a kid
        if found:
            xticks.append(fam)
            f += 1


    trace0 = go.Scatter(
        x = x_red,
        y = red,
        mode = 'markers',
        name = 'affected',
        marker = dict(size = 10, color = 'red'),
        text = red_text,
        hoverinfo='text',
        hoveron='points'
    )
    trace1 = go.Scatter(
        x = x_green,
        y = green,
        mode = 'markers',
        name = 'unaffected',
        marker = dict(size = 10, color = 'green'),
        hovertext = green_text,
        hoverinfo='text',
        hoveron='points'
    )
    layout = go.Layout(
        xaxis=dict(
            tickvals=range(len(xticks)),
            ticktext=xticks,
            zeroline = False
        ),
        hovermode='closest'
    )

    data = [trace0, trace1]
    iplot({'data': data, 'layout': layout})

One plot looks good, let's try subplots:

In [121]:
import plotly.graph_objs as go
from plotly import tools

df = pd.read_csv('/data/sudregp/cnv/penncnv/results/summary_called.csv')
df['total'] = df.denovo + df.inherited

fig = tools.make_subplots(rows=3, cols=1, subplot_titles=('De Novo CNVs',
                                                          'Inherited CNVs',
                                                          'All CNVs'))

for c, cnv in enumerate(['denovo', 'inherited', 'total']):
    x_red, x_green, red, green, xticks = [], [], [], [], []
    red_text, green_text = [], []

    # loop through families
    f = 0
    for fam in famids:
        fam_kids = [t['child'] for t in trios if t['famid'] == fam]
        found = False
        for kid in fam_kids:
            if kid in list(df.kid):
                found = True
                if kid in affected:
                    red.append(int(df[df.kid == kid][cnv]))
                    x_red.append(f)
                    red_text.append(kid)
                else:
                    green.append(int(df[df.kid == kid][cnv]))
                    x_green.append(f)
                    green_text.append(kid)
        # only increase counter if we added a kid
        if found:
            xticks.append(fam)
            f += 1


    trace0 = go.Scatter(
        x = x_red,
        y = red,
        mode = 'markers',
        name = 'affected',
        marker = dict(size = 10, color = 'red'),
        text = red_text,
        hoverinfo='text+y',
        showlegend = False
    )
    trace1 = go.Scatter(
        x = x_green,
        y = green,
        mode = 'markers',
        name = 'unaffected',
        marker = dict(size = 10, color = 'green'),
        hovertext = green_text,
        hoverinfo='text+y',
        showlegend = False
    )
    fig.append_trace(trace0, c + 1, 1)
    fig.append_trace(trace1, c + 1, 1)

fig['layout'].update(height=900, width=800, title='Called CNVs',
                     xaxis1=dict(tickvals=range(len(xticks)),
                                ticktext=xticks,
                                zeroline = False),
                     xaxis2=dict(tickvals=range(len(xticks)),
                                ticktext=xticks,
                                zeroline = False),
                     xaxis3=dict(tickvals=range(len(xticks)),
                                ticktext=xticks,
                                zeroline = False),
                     hovermode='closest')
fig['data'][0]['showlegend'] = True
fig['data'][1]['showlegend'] = True
iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]



Great, now let's make it into a function:

In [4]:
def plot_cnvs(fname, t_str):
    import plotly.graph_objs as go
    from plotly import tools

    df = pd.read_csv(fname)
    df['total'] = df.denovo + df.inherited

    fig = tools.make_subplots(rows=3, cols=1, subplot_titles=('De Novo CNVs',
                                                              'Inherited CNVs',
                                                              'All CNVs'))

    for c, cnv in enumerate(['denovo', 'inherited', 'total']):
        x_red, x_green, red, green, xticks = [], [], [], [], []
        red_text, green_text = [], []

        # loop through families
        f = 0
        for fam in famids:
            fam_kids = [t['child'] for t in trios if t['famid'] == fam]
            found = False
            for kid in fam_kids:
                if kid in list(df.kid):
                    found = True
                    if kid in affected:
                        red.append(int(df[df.kid == kid][cnv]))
                        x_red.append(f)
                        red_text.append(kid)
                    else:
                        green.append(int(df[df.kid == kid][cnv]))
                        x_green.append(f)
                        green_text.append(kid)
            # only increase counter if we added a kid
            if found:
                xticks.append(fam)
                f += 1


        trace0 = go.Scatter(
            x = x_red,
            y = red,
            mode = 'markers',
            name = 'affected',
            marker = dict(size = 10, color = 'red'),
            text = red_text,
            hoverinfo='text+y',
            showlegend = False
        )
        trace1 = go.Scatter(
            x = x_green,
            y = green,
            mode = 'markers',
            name = 'unaffected',
            marker = dict(size = 10, color = 'green'),
            hovertext = green_text,
            hoverinfo='text+y',
            showlegend = False
        )
        fig.append_trace(trace0, c + 1, 1)
        fig.append_trace(trace1, c + 1, 1)

    fig['layout'].update(height=900, width=800, title=t_str,
                         xaxis1=dict(tickvals=range(len(xticks)),
                                    ticktext=xticks,
                                    zeroline = False),
                         xaxis2=dict(tickvals=range(len(xticks)),
                                    ticktext=xticks,
                                    zeroline = False),
                         xaxis3=dict(tickvals=range(len(xticks)),
                                    ticktext=xticks,
                                    zeroline = False),
                         hovermode='closest')
    fig['data'][0]['showlegend'] = True
    fig['data'][1]['showlegend'] = True
    iplot(fig)

In [5]:
plot_cnvs('/data/sudregp/cnv/penncnv/results/summary_called.csv',
          'All called (numCNV <= 20)')

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]
[ (3,1) x3,y3 ]



OK, so this is working. But how come I have more than 20 CNVs in any category? Isn't that the QC threshold I was playing with?

Well, those QC values were based on rawcnv, not jointcnv. And that's the issue. So, we could use the jointcnv values for QC. Does LRR_SD vary between joint and raw cnvs calls? LRR_SD stands for the variability of Log R Ratio. It is the log of the ratio between the total observed intensity and the expected intensity at each probe. So, yeah, it is the same for both raw and joint calls. However, we do need to use the number of CNVs from the joint calls, which will change the quality data frame.

In [14]:
df = pd.read_table('/data/sudregp/cnv/penncnv/results/all_simplex.qcsum')
# we ran for all samples, but let's look at only stats for samples in our simplex study
keep_me = [i for i in range(df.shape[0]) if df.File[i].split('/')[-1] in samples]
df = df.iloc[keep_me]
df

Unnamed: 0,File,LRR_mean,LRR_median,LRR_SD,BAF_mean,BAF_median,BAF_SD,BAF_drift,WF,NumCNV
1,InfiniumExome/CLIA_400149,-0.0550,0.0,0.3398,0.5041,0.5,0.0555,0.000391,0.0940,404
2,InfiniumExome/CLIA_400190,0.0081,0.0,0.1328,0.5002,0.5,0.0351,0.000000,-0.0160,3
3,InfiniumExome/CLIA_400186,0.0172,0.0,0.1504,0.4993,0.5,0.0383,0.000084,-0.0186,9
4,InfiniumExome/CLIA_400140,0.0022,0.0,0.1338,0.4991,0.5,0.0362,0.000000,-0.0293,2
6,InfiniumExome/CLIA_400138,0.0136,0.0,0.1353,0.4989,0.5,0.0343,0.000121,0.0204,7
7,InfiniumExome/CLIA_400121,0.0073,0.0,0.1409,0.4980,0.5,0.0368,0.000084,-0.0188,2
8,InfiniumExome/CLIA_400172,0.0251,0.0,0.1802,0.4999,0.5,0.0361,0.000078,0.0495,126
9,InfiniumExome/CLIA_400191,0.0103,0.0,0.1322,0.4995,0.5,0.0345,0.000000,-0.0159,7
12,InfiniumExome/CLIA_400148,0.0152,0.0,0.1461,0.4981,0.5,0.0358,0.000000,-0.0186,3
13,InfiniumExome/CLIA_400153,-0.0027,0.0,0.1279,0.5006,0.5,0.0375,0.000133,-0.0156,3


In [9]:
len(samples)

97

# TODO

* recount only the intersecting CNVs
* how about a gene-level burden test, where we count genes with a CNV, rather than cnvs?
* Filter based on neural vs nonneural CNVs (i.e. expressed in the brain)
* Look at literature-only CNVs (J Chia and A Thapaer for CNVs in ADHD)
* Try using quality score from https://www.ncbi.nlm.nih.gov/pubmed/27402902
* Try removing calls in immunoglobin, telomere and centromere regions (see PennCNV annotation page) 
* try merging adjacent CNV calls? (PennCNV can do it)
* how about transforming XHMM calls to Penncnv format to handle everything with similar scripts?
* Worth calculating p-values? For that one trio it was always 0!
* Try PennCNV steps with adjusted pipeline to see if we get anything different
* Look into sex chromosomes? Something to the idea that adhd is more present in boys...
* Play with the minimum size of the CNV
* Play with the HMM parameters
* compile a PFB file for this specific population?
* Check (and plot) within family differences; the ones with big differences are interesting.
* Include parent burden in the analysis
* Match with file of ranked simplex by Wendy (maybe blindly)?
* Do all of the above with XHMM and array data. Maybe start with array because it'll be simpler?

# Useful links

