Let's see what's going on with this child: CLAI_400178.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
%matplotlib inline

In [2]:
# figure out who is who in each trio
import glob
ped_file = '/data/sudregp/cnv/simplex.ped'
wes_prefix = ['CLIA', 'CCGO', 'WPS']
trios = []
affected = []
controls = []
samples = []
famids = []
fid = open(ped_file, 'r')
for line in fid:
    famid, sid, fa, mo, sex, aff = line.rstrip().split('\t')
    if fa[:4] in wes_prefix and mo[:4] in wes_prefix and sid[:4] in wes_prefix:
        fam = {}
        fam['child'] = sid
        if aff == '1':
            affected.append(sid)
        else:
            controls.append(sid)
        fam['father'] = fa
        fam['mother'] = mo
        fam['famid'] = famid
        trios.append(fam)
        samples += [sid, fa, mo]
        famids.append(famid)
fid.close()
samples = set(samples)
famids = set(famids)

fid = open('/data/sudregp/cnv/kid_samples.txt', 'r')
good_kids = [line.rstrip() for line in fid]
fid.close()

In [3]:
def plot_plink_cnvs(fname, t_str, verbose=False):
    
    import plotly.graph_objs as go
    from plotly import tools

    df = pd.read_table(fname, delimiter='\s+', index_col=1)
    df.head()

    x_red, x_green, red, green, xticks = [], [], [], [], []
    red_text, green_text = [], []

    # loop through families
    f = 0
    for fam in famids:
        fam_kids = [t['child'] for t in trios if t['famid'] == fam]
        found = False
        for kid in fam_kids:
            if kid in good_kids:
                found = True
                idx = [i for i in range(df.shape[0]) if df.index[i].find(kid) >= 0][0]
                if kid in affected:
                    red.append(int(df.iloc[idx]['NSEG']))
                    x_red.append(f)
                    red_text.append(kid)
                else:
                    green.append(int(df.iloc[idx]['NSEG']))
                    x_green.append(f)
                    green_text.append(kid)
        # only increase counter if we added a kid
        if found:
            xticks.append(fam)
            f += 1

    fig = go.Figure()
                
    trace0 = go.Scatter(
        x = x_red,
        y = red,
        mode = 'markers',
        name = 'affected',
        marker = dict(size = 10, color = 'red'),
        text = red_text,
        hoverinfo='text+y',
        showlegend = True
    )
    trace1 = go.Scatter(
        x = x_green,
        y = green,
        mode = 'markers',
        name = 'unaffected',
        marker = dict(size = 10, color = 'green'),
        hovertext = green_text,
        hoverinfo='text+y',
        showlegend = True
    )
    fig['data'] = [trace0, trace1]
    fig['layout'].update(height=400, width=800, title=t_str,
                             xaxis1=dict(tickvals=range(len(xticks)),
                                        ticktext=xticks,
                                        zeroline = False),
                             hovermode='closest')
    iplot(fig)
    
    # print family order
    if verbose:
        score = []
        for x in range(max(x_green + x_red)):
            idx = [i for i, val in enumerate(x_red) if val == x]
            if len(idx) == 0:
                aff = 0
            else:
                aff = red[idx[0]]
            idx = [i for i, val in enumerate(x_green) if val == x]
            if len(idx) == 0:
                unaff = 0
            else:
                unaff = max([green[i] for i in idx])
            score.append(aff - unaff)
        order = np.argsort(score)[::-1]
        fam_names = fig.layout.xaxis1['ticktext']
        disrupted = [fam_names[v] for v in order if score[v] > 0]
        print 'Best disrupted:', ', '.join(disrupted)
        protected = [fam_names[v] for v in order[::-1] if score[v] < 0]
        print 'Best protected:', ', '.join(protected)

In [12]:
import glob

data_dir = '/home/sudregp/data/cnv/xhmm/'
files = glob.glob(data_dir + '/denovo*q60_brain*cnv.indiv')
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)
files = glob.glob(data_dir + '/denovo*q60_genes*cnv.indiv')
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)

Best disrupted: 10406, 10197
Best protected: 10041, 10215


Best disrupted: 10406, 10042, 10090, 10033, 10173, 10197, 1893, 10128, 1895, 10178, 10182, 10164, 10369
Best protected: 10094, 10131, 10448, 10041, 855, 10215, 1892, 10153


First, what are the 3 brain genes affected? And where are they expressed?

In [15]:
%%bash
grep -B 2 CLIA_400178 /home/sudregp/data/cnv/xhmm/denovo_q60_brainGenes.reg
grep -B 2 CLIA_400129 /home/sudregp/data/cnv/xhmm/denovo_q60_brainGenes.reg
grep -B 2 CLIA_400123 /home/sudregp/data/cnv/xhmm/denovo_q50_brainGenes.reg

RANGE (+/- 0kb )  [ 4 158141735 158287226 GRIA2 ]
 FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     OLAP   OLAP_U   OLAP_R
   1   CLIA_400178        2    4    158284906    158289677    DEL    4.771   0.4864  0.01569  0.01595
--
RANGE (+/- 0kb )  [ 7 119913721 120390387 KCND2 ]
 FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     OLAP   OLAP_U   OLAP_R
   1   CLIA_400178        2    7    120388252    120427537    DEL    39.28  0.05437 0.004157 0.004481
--
RANGE (+/- 0kb )  [ 10 89623194 89728532 PTEN ]
 FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     OLAP   OLAP_U   OLAP_R
   1   CLIA_400178        2   10     89690723     89727498    DEL    36.77        1   0.3491   0.3491
RANGE (+/- 0kb )  [ 7 74071990 74175022 GTF2I ]
 FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     OLAP   OLAP_U   OLAP_R
   1   CLIA_400129        2    7     74157725     74198046    DUP    40.32    0.429 

From http://human.brain-map.org/microarray/search/show?exact_match=false&search_term=GRIA2&search_type=gene, we have to establish some thresholdhs to be able to single out brain regions. Also, it's possible to do it through their API, but it'll take a lot of work, so I'm not sure if that's where we should be spending time now.

Let's see how robust this is across XHMM quality controls:

In [13]:
import glob

data_dir = '/home/sudregp/data/cnv/xhmm/'
files = glob.glob(data_dir + '/denovo*q?0_brain*cnv.indiv')
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)


Best disrupted: 10406, 10090, 10197
Best protected: 10041, 10448, 10215, 1895


Best disrupted: 10406
Best protected: 


Best disrupted: 10406
Best protected: 10041


Best disrupted: 10406, 10197
Best protected: 10041, 10215


Best disrupted: 10406
Best protected: 10041, 10215


In [14]:
%%bash
grep -B 2 CLIA_400178 /home/sudregp/data/cnv/xhmm/denovo_q90_brainGenes.reg


RANGE (+/- 0kb )  [ 4 158141735 158287226 GRIA2 ]
 FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     OLAP   OLAP_U   OLAP_R
   1   CLIA_400178        2    4    158284906    158289677    DEL    4.771   0.4864  0.01569  0.01595


Interestly, it's quite high quality. It only goes again when we remove the stuff in the clean list. Should check later why. 

I'll run the idea of intersecting two brain maps (expression and pvalues for brain metric), and see what he thinks.