Let's evaluate the results of our first run with Conifer:

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
%matplotlib inline

In [19]:
def classify_cnvs(cnvs, trios):
    inherited = {}
    denovo = {}
    for trio in trios:
        inherited[trio['child']] = []
        denovo[trio['child']] = []

    # need to filter in chr and type first because ranges repeat for every chr,
    # and inheritance is only true for the same type of CNV
    for chr in ['chr%d' % d for d in range(1, 25)]:
        for cnvtype in ['dup', 'del']:
            for trio in trios:
                kid = [cnv for cnv in cnvs[trio['child']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                dad = [cnv for cnv in cnvs[trio['father']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                mom = [cnv for cnv in cnvs[trio['mother']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                for cnv in kid:
                    is_inherited = False
                    # if CNV start is not within any parents boundary AND
                    #         CNV end is not within any parents boundary
                    # it's denovo
                    for pcnv in dad + mom:
                        if ((cnv[1] >= pcnv[1] or cnv[1] <= pcnv[2]) or
                            (cnv[2] >= pcnv[1] or cnv[2] <= pcnv[2])):
                            is_inherited = True
                    if is_inherited:
                        inherited[trio['child']].append(cnv)
                    else:
                        denovo[trio['child']].append(cnv)
    return denovo, inherited


def write2plink(data, fname, trios):
    fid = open(fname, 'w')
    fid.write('FID\tIID\tCHR\tBP1\tBP2\tTYPE\tSCORE\tSITE\n')
    for sample, cnvs in data.iteritems():
        famid = [trio['famid'] for trio in trios if trio['child'] == sample][0]
        for cnv in cnvs:
            ch = cnv[0].replace('chr', '')
            if cnv[3] == 'del':
                cnvt = '1'
            else:
                cnvt = '3'
            if len(cnv) > 4:
                score = cnv[5]
                site = cnv[4]
            else:
                score, site = ['0', '0']
            fid.write('\t'.join([famid, sample, ch, cnv[1], cnv[2], cnvt,
                                 score, site]) + '\n')
    fid.close()

In [58]:
import glob
ped_files = ['/data/sudregp/multiplex_simplex/simplex.ped']
wes_prefix = ['CLIA', 'CCGO', 'WPS']
# fid = open('/home/sudregp/data/multiplex_simplex/samples_simplex_all.txt', 'r')
# exclude_list = [line.rstrip() for line in fid]
# fid.close()

# no controls/affected pair for comparison
exclude_list = ['CLIA_400165', 'CLIA_400164', 'CLIA_400155', 'CLIA_400146',
                'CLIA_400145', 'CLIA_400126', 'CLIA_400207', 'CLIA_400208',
                'CLIA_400209']
# missing one parent
exclude_list += ['CLIA_400169', 'CLIA_400168']
# family 9030
exclude_list += ['CCGO_800978', 'CCGO_800977', 'CCGO_800976', 'CCGO_800979',
                 'CCGO_800980', 'CLIA_400067']

trios = []
affected = []
controls = []
samples = []
famids = []
for ped_file in ped_files:
    fid = open(ped_file, 'r')
    for line in fid:
        famid, sid, fa, mo, sex, aff = line.rstrip().split('\t')
        # if the current ID and its parents have WES data, and the sample is 
        # not in yet
        if (fa.split('_')[0] in wes_prefix and
            mo.split('_')[0] in wes_prefix and
            sid.split('_')[0] in wes_prefix and
            sid not in samples and
            (sid not in exclude_list or fa not in exclude_list or mo not in exclude_list)):
            fam = {}
            fam['child'] = sid
            if aff == '1':
                affected.append(sid)
            else:
                controls.append(sid)
            fam['father'] = fa
            fam['mother'] = mo
            fam['famid'] = famid
            trios.append(fam)
            samples += [sid, fa, mo]
            famids.append(famid)
    fid.close()
samples = set(samples)
famids = set(famids)
kids = set(affected + controls)
good_kids = kids

print 'Unique samples:', len(samples)
print 'Unique families:', len(famids)
print 'Unique children:', len(kids)

Unique samples: 84
Unique families: 19
Unique children: 46


In [65]:
files = glob.glob('/data/sudregp/cnv/conifer84/dnacopy/*_min2undo3SD.txt')

cnvs = {}
for fname in files:
    fid = open(fname, 'r')
    sample = '_'.join(fname.split('/')[-1].split('_')[:2])
    cnvs[sample] = []
    for line in fid:
        chr, start, end, nsites, val = line.rstrip().split('\t')
        if float(val) > 1:
            cnvs[sample].append([chr, start, end, 'dup', '0', nsites])
        elif float(val) < -1:
            cnvs[sample].append([chr, start, end, 'del', '0', nsites])
fid.close()

dn, inh = classify_cnvs(cnvs, trios)
write2plink(dn, '/data/sudregp/cnv/conifer84/dnacopy/denovo.cnv', trios)
write2plink(inh, '/data/sudregp/cnv/conifer84/dnacopy/inherited.cnv', trios)
tmp_cnv = {}
for sample in cnvs.iterkeys():
    if sample in affected + controls:
        tmp_cnv[sample] = cnvs[sample]
write2plink(tmp_cnv, '/data/sudregp/cnv/conifer84/dnacopy/all.cnv', trios)

And do some filtering in PLINK:

In [None]:
%%bash

module load plink/1.07
cd /data/sudregp/cnv/conifer84/dnacopy/
cat ../../penncnv/wellknown_region_hg19 > bad_regions.list
cp ../../penncnv/genes.txt ../../penncnv/hg19_allenBrainGene_trimmed.txt ../../penncnv/glist-hg19 .

# exomeCopy generated too many inherited CNVs. Let's play only with denovos for now.
for cnvtype in denovo inherited all; do
    cnvname=${cnvtype}.cnv
    plink --cnv-list $cnvname --cnv-make-map --noweb --out ${cnvtype};
    
    # remove bad regions
    plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
        --noweb --1 --cnv-exclude bad_regions.list --cnv-overlap .5 \
        --cnv-write --out ${cnvtype}_clean
    plink --cnv-list ${cnvtype}_clean.cnv --cnv-make-map --noweb --1 \
        --out ${cnvtype}_clean
    
    for qc in '' '_clean'; do
        cnvname=${cnvtype}${qc}.cnv
        # whole burden
        plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-check-no-overlap --out ${cnvtype}${qc}_burden;
        # gene sets
        plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset genes.txt --out ${cnvtype}${qc}_genes;
        plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset hg19_allenBrainGene_trimmed.txt \
            --out ${cnvtype}${qc}_brainGenes;
        # subtypes only
        for sub in del dup; do
            plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-${sub} --out ${cnvtype}${qc}_${sub}Burden;
            # gene sets
            plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset genes.txt --cnv-${sub} \
                --out ${cnvtype}${qc}_${sub}Genes;
            plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset hg19_allenBrainGene_trimmed.txt --cnv-${sub} \
                --out ${cnvtype}${qc}_${sub}BrainGenes;
        done;
    done;
done

In [22]:
%%bash
cd ~/data/cnv/conifer84/dnacopy
cat denovo_clean.cnv | awk '{ if (($5-$4)/1000 > 0) { print $_, ($5-$4)/1000 } }'

   1976   CLIA_400201   11     56268111     56294363      3            4        0  26.252
   1976   CLIA_400201   24      2595249      2605951      3            3        0  10.702
   1976   CLIA_400204    7    142045736    142104257      1            6        0  58.521
   1976   CLIA_400204   11     46879905     46880550      3            3        0  0.645
  10182   CLIA_400170   21     10215318     11097543      3           52        0  882.225
  10182   CLIA_400172   12        90972       148948      3            3        0  57.976
  10153   CLIA_400212   13     50276470     50280320      3            4        0  3.85
  10153   CLIA_400128   15     30871436     32403974      1          215        0  1532.54
   1893   CLIA_400163   18     47088629     47093751      3            3        0  5.122
  10094   CLIA_400144    3    160473985    160474267      3            3        0  0.282
  10094   CLIA_400144   20     26188761     29578897      1            4        0  3390.14


In [28]:
print 'CLIA_400204' in affected
print 'CLIA_400128' in affected
print 'CLIA_400144' in affected

True
True
False


Nothing good here... length restriction shows no pattern, nor does deletion/duplication subsets.

In [36]:
%%bash
cd ~/data/cnv/conifer84/dnacopy
cat denovo.cnv | awk '{ if (($5-$4)/1000 > 50) { print $_, ($5-$4)/1000 } }'

1976	CLIA_400203	6	170923465	171054620	1	6	0 131.155
1976	CLIA_400204	4	49199159	49561533	3	10	0 362.374
1976	CLIA_400204	7	142045736	142104257	1	6	0 58.521
10182	CLIA_400170	21	10215318	11097543	3	52	0 882.225
10182	CLIA_400172	12	90972	148948	3	3	0 57.976
1892	CLIA_400175	1	152488400	152552015	1	3	0 63.615
1895	CLIA_400158	22	25627551	25853158	3	19	0 225.607
10094	CLIA_400130	19	43269599	43716465	3	93	0 446.866
855	CLIA_400133	16	34510752	34681899	3	4	0 171.147
10131	CLIA_400139	21	44592216	44837422	3	9	0 245.206
10131	CLIA_400138	8	7450205	7563023	3	3	0 112.818
10164	CLIA_400122	24	23563343	23655335	3	2	0 91.992
10153	CLIA_400210	16	34417097	34681899	3	5	0 264.802
10033	CLIA_400166	14	106091549	106174157	3	31	0 82.608
10153	CLIA_400128	15	30871436	32403974	1	215	0 1532.54
10153	CLIA_400128	23	3747103	3819904	3	10	0 72.801
10197	CLIA_400129	14	106573381	106791034	1	37	0 217.653
10197	CLIA_400129	17	20641144	20744351	3	3	0 103.207
1893	CLIA_400162	22	25627551	25853158	3	19	0 225.607
1

In [52]:
nlen = 200
cnv_types = ['1']
fid = open('/data/sudregp/cnv/conifer84/dnacopy/denovo.cnv', 'r')

aff_cnt, ctl_cnt = 0, 0
for line in fid:
    famid, sample, chr, start, end, cnvtype, a, b = line.rstrip().split('\t')
    if (line.find('BP') < 0 and (int(end) - int(start))/1000 > nlen and
        cnvtype in cnv_types):
        if sample in affected:
            aff_cnt += 1
        else:
            ctl_cnt += 1
print 'Affected: %d, controls: %d' % (aff_cnt, ctl_cnt)

Affected: 2, controls: 1


Tried some different combination in the cell above, but nothing poped up. Let's try the gene-based analysis:

In [53]:
fid = open('/home/sudregp/data/cnv/conifer84/dnacopy/denovo_clean_genes.reg')
genes = {}
for line in fid:
    if line.find('RANGE') >= 0:
        gene = line.split(' ')[-2]
        genes[gene] = 0
    elif line.find('DUP') > 0 or line.find('DEL') > 0:
        genes[gene] += 1
fid.close()
df = pd.DataFrame.from_dict(genes, orient='index')
df.columns = ['count']
df.sort_values(by='count', ascending=False).head(10)

Unnamed: 0,count
MIR663A,1
CHRNA7,1
ARHGAP11B,1
ULK4P1,1
KPNA3,1
MIR211,1
CD99,1
LRP4,1
LRP4-AS1,1
LOC284801,1


Nothing there either...

In [55]:
fid = open('/home/sudregp/data/cnv/conifer84/dnacopy/denovo_genes.reg')
genes = {}
for line in fid:
    if line.find('RANGE') >= 0:
        gene = line.split(' ')[-2]
        genes[gene] = 0
    elif line.find('DUP') > 0 or line.find('DEL') > 0:
        genes[gene] += 1
fid.close()
df = pd.DataFrame.from_dict(genes, orient='index')
df.columns = ['count']
df.sort_values(by='count', ascending=False).head(10)

Unnamed: 0,count
PSG4,2
LOC100288637,2
LOC283914,2
KRT33B,2
DMBT1,2
S100B,2
IGLL3P,2
LRP5L,2
DIP2A,2
PSG8,2


Many genes have two kids if we don't clean the CNVs based on frequency. Not necessarily earth shattering...

Let's go ahead and make the plots just for kicks:

In [56]:
def plot_plink_cnvs(fname, t_str, verbose=False):
    
    import plotly.graph_objs as go
    from plotly import tools

    df = pd.read_table(fname, delimiter='\s+', index_col=1)
    df.head()

    x_red, x_green, red, green, xticks = [], [], [], [], []
    red_text, green_text = [], []

    # loop through families
    f = 0
    for fam in famids:
        fam_kids = [t['child'] for t in trios if t['famid'] == fam]
        found = False
        for kid in fam_kids:
            if kid in good_kids:
                found = True
                idx = [i for i in range(df.shape[0]) if df.index[i].find(kid) >= 0][0]
                if kid in affected:
                    red.append(int(df.iloc[idx]['NSEG']))
                    x_red.append(f)
                    red_text.append(kid)
                else:
                    green.append(int(df.iloc[idx]['NSEG']))
                    x_green.append(f)
                    green_text.append(kid)
        # only increase counter if we added a kid
        if found:
            xticks.append(fam)
            f += 1

    fig = go.Figure()
                
    trace0 = go.Scatter(
        x = x_red,
        y = red,
        mode = 'markers',
        name = 'affected',
        marker = dict(size = 10, color = 'red'),
        text = red_text,
        hoverinfo='text+y',
        showlegend = True
    )
    trace1 = go.Scatter(
        x = x_green,
        y = green,
        mode = 'markers',
        name = 'unaffected',
        marker = dict(size = 10, color = 'green'),
        hovertext = green_text,
        hoverinfo='text+y',
        showlegend = True
    )
    fig['data'] = [trace0, trace1]
    fig['layout'].update(height=400, width=800, title=t_str,
                             xaxis1=dict(tickvals=range(len(xticks)),
                                        ticktext=xticks,
                                        zeroline = False),
                             hovermode='closest')
    iplot(fig)
    
    # print family order
    if verbose:
        score = []
        for x in range(max(x_green + x_red)):
            idx = [i for i, val in enumerate(x_red) if val == x]
            if len(idx) == 0:
                aff = 0
            else:
                aff = red[idx[0]]
            idx = [i for i, val in enumerate(x_green) if val == x]
            if len(idx) == 0:
                unaff = 0
            else:
                unaff = max([green[i] for i in idx])
            score.append(aff - unaff)
        order = np.argsort(score)[::-1]
        fam_names = fig.layout.xaxis1['ticktext']
        disrupted = [fam_names[v] for v in order if score[v] > 0]
        print 'Best disrupted:', ', '.join(disrupted)
        protected = [fam_names[v] for v in order[::-1] if score[v] < 0]
        print 'Best protected:', ', '.join(protected)

In [59]:
import glob

data_dir = '/home/sudregp/data/cnv/conifer84/dnacopy/'
files = glob.glob(data_dir + '/*clean*cnv.indiv')
files.sort()
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)

Best disrupted: 10153
Best protected: 


Best disrupted: 10178, 10448
Best protected: 10094, 10128, 10182, 1895


Best disrupted: 10153
Best protected: 


Best disrupted: 10448, 1976, 10153
Best protected: 10094


Best disrupted: 10448, 10153
Best protected: 10094


Best disrupted: 
Best protected: 


Best disrupted: 10178, 10448
Best protected: 10128, 10182, 10094, 1976, 1895, 10153


Best disrupted: 10178, 10448
Best protected: 10128, 10182, 10094, 10153, 1895


Best disrupted: 10178, 10448
Best protected: 10094, 10128, 10182, 1895


Best disrupted: 10153
Best protected: 


Best disrupted: 
Best protected: 10094, 10182


Best disrupted: 10153
Best protected: 


Best disrupted: 1976, 10153
Best protected: 10094


Best disrupted: 10153
Best protected: 10094


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 10094, 1976, 10182, 10153


Best disrupted: 
Best protected: 10094, 10182, 10153


Best disrupted: 
Best protected: 10094, 10182


Best disrupted: 
Best protected: 


Best disrupted: 10178, 10448
Best protected: 10128, 10094, 1895


Best disrupted: 
Best protected: 


Best disrupted: 10448
Best protected: 10094


Best disrupted: 10448
Best protected: 10094


Best disrupted: 
Best protected: 


Best disrupted: 10178, 10448
Best protected: 10128, 1895


Best disrupted: 10178, 10448
Best protected: 10128, 1895


Best disrupted: 10178, 10448
Best protected: 10128, 10094, 1895


The pictures show what we had seen before: 2 of the trios had denovo deletions in the affected kid, clean data, but one control had it as well. Th nice thing is that the ADHD deletion is in a brain-gene, and that's the only hit. Not sure if it's worth reporting, though. We'd definietely have to check the SNPs for this one.

# No cleaning set

Let's do the same analysis as before, but this time we don't do the extra pruning afterwards. Of course we'll get lots of crappy calls, but hopefully there's a pattern in the noise and we can clear some of it later.

In [None]:
library(DNAcopy)

data_dir = '~/data/cnv/conifer84/export_svdzrpkm/'
out_dir = '~/data/cnv/conifer84/dnacopy/'
files = list.files(path=data_dir, pattern='*bam.rpkm.bed')
for (fname in files) {
    sname = strsplit(x=fname, split = '[.]')[[1]][1]
    df = read.table(paste(data_dir, fname, sep=''))
    CNA.object = CNA(cbind(df$V5), df$V1, df$V2, data.type='logratio',
                     sampleid=sname)
    smoothed.CNA.object <- smooth.CNA(CNA.object)
    res <- segment(smoothed.CNA.object, verbose=1)
    write.table(res$output[,2:6],
                file=paste(out_dir, sprintf('%s_min2.txt', sname), sep=''),
                row.names=F, col.names=F, quote=F, sep="\t")
}

In [66]:
files = glob.glob('/data/sudregp/cnv/conifer84/dnacopy/*_min2.txt')

# we do have much more noise now, so we need to lower our threshold
cnvs = {}
for fname in files:
    fid = open(fname, 'r')
    sample = '_'.join(fname.split('/')[-1].split('_')[:2])
    cnvs[sample] = []
    for line in fid:
        chr, start, end, nsites, val = line.rstrip().split('\t')
        if float(val) > .5:
            cnvs[sample].append([chr, start, end, 'dup', '0', nsites])
        elif float(val) < -.5:
            cnvs[sample].append([chr, start, end, 'del', '0', nsites])
fid.close()

dn, inh = classify_cnvs(cnvs, trios)
write2plink(dn, '/data/sudregp/cnv/conifer84/dnacopy/denovo_noUndo.cnv', trios)
write2plink(inh, '/data/sudregp/cnv/conifer84/dnacopy/inherited_noUndo.cnv', trios)
tmp_cnv = {}
for sample in cnvs.iterkeys():
    if sample in affected + controls:
        tmp_cnv[sample] = cnvs[sample]
write2plink(tmp_cnv, '/data/sudregp/cnv/conifer84/dnacopy/all_noUndo.cnv', trios)

In [67]:
%%bash

module load plink/1.07
cd /data/sudregp/cnv/conifer84/dnacopy/
cat ../../penncnv/wellknown_region_hg19 > bad_regions.list
cp ../../penncnv/genes.txt ../../penncnv/hg19_allenBrainGene_trimmed.txt ../../penncnv/glist-hg19 .

# exomeCopy generated too many inherited CNVs. Let's play only with denovos for now.
for cnvtype in denovo_noUndo inherited_noUndo all_noUndo; do
    cnvname=${cnvtype}.cnv
    plink --cnv-list $cnvname --cnv-make-map --noweb --out ${cnvtype};
    
    # remove bad regions
    plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
        --noweb --1 --cnv-exclude bad_regions.list --cnv-overlap .5 \
        --cnv-write --out ${cnvtype}_clean
    plink --cnv-list ${cnvtype}_clean.cnv --cnv-make-map --noweb --1 \
        --out ${cnvtype}_clean
    
    for qc in '' '_clean'; do
        cnvname=${cnvtype}${qc}.cnv
        # whole burden
        plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-check-no-overlap --out ${cnvtype}${qc}_burden;
        # gene sets
        plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset genes.txt --out ${cnvtype}${qc}_genes;
        plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset hg19_allenBrainGene_trimmed.txt \
            --out ${cnvtype}${qc}_brainGenes;
        # subtypes only
        for sub in del dup; do
            plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
            --noweb --1 --cnv-${sub} --out ${cnvtype}${qc}_${sub}Burden;
            # gene sets
            plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset genes.txt --cnv-${sub} \
                --out ${cnvtype}${qc}_${sub}Genes;
            plink --map ${cnvname}.map --fam ../../simplex.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset hg19_allenBrainGene_trimmed.txt --cnv-${sub} \
                --out ${cnvtype}${qc}_${sub}BrainGenes;
        done;
    done;
done


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ denovo_noUndo.log ]
Analysis started: Fri Feb 16 13:41:45 2018

Options in effect:
	--cnv-list denovo_noUndo.cnv
	--cnv-make-map
	--noweb
	--out denovo_noUndo


Reading segment list (CNVs) from [ denovo_noUndo.cnv ]
Writing new MAP file to [ denovo_noUndo.cnv.map ]
Wrote 506 unique positions to file

Analysis finished: Fri Feb 16 13:41:45 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07 

In [69]:
nlen = 500
cnv_types = ['1']
fid = open('/data/sudregp/cnv/conifer84/dnacopy/denovo_noUndo.cnv', 'r')

aff_cnt, ctl_cnt = 0, 0
for line in fid:
    famid, sample, chr, start, end, cnvtype, a, b = line.rstrip().split('\t')
    if (line.find('BP') < 0 and (int(end) - int(start))/1000 > nlen and
        cnvtype in cnv_types):
        if sample in affected:
            aff_cnt += 1
        else:
            ctl_cnt += 1
print 'Affected: %d, controls: %d' % (aff_cnt, ctl_cnt)

Affected: 14, controls: 18


In [71]:
import glob

data_dir = '/home/sudregp/data/cnv/conifer84/dnacopy/'
files = glob.glob(data_dir + '/denovo_noUndo*clean*cnv.indiv')
files.sort()
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)

Best disrupted: 
Best protected: 10164


Best disrupted: 1892, 10153, 855, 1976, 10173, 10178
Best protected: 10164, 10131, 10094, 10448, 10215, 10182


Best disrupted: 
Best protected: 


Best disrupted: 1892, 855, 10173, 10178
Best protected: 10131, 10094, 10215, 10182, 10448


Best disrupted: 1892, 855, 10173, 10178
Best protected: 10131, 10094, 10215, 10182, 10448


Best disrupted: 
Best protected: 10164


Best disrupted: 10153, 1976
Best protected: 10164, 10131


Best disrupted: 10153, 1976
Best protected: 10164, 10131


Best disrupted: 1892, 10153, 855, 1976, 10173, 10178
Best protected: 10164, 10131, 10094, 10448, 10215, 10182


Nothing really consistent...

In [73]:
fname = '/home/sudregp/data/cnv/conifer84/dnacopy/denovo_noUndo_clean_genes.reg'
fid = open(fname)
genes = {}
for line in fid:
    if line.find('RANGE') >= 0:
        gene = line.split(' ')[-2]
        genes[gene] = 0
    elif line.find('DUP') > 0 or line.find('DEL') > 0:
        genes[gene] += 1
fid.close()
df = pd.DataFrame.from_dict(genes, orient='index')
df.columns = ['count']
df.sort_values(by='count', ascending=False).head(10)

Unnamed: 0,count
TPTE,8
ANKRD30BP2,6
BAGE2,6
BAGE,6
LOC284801,5
MIR663A,5
FRG1B,4
TEKT4P2,4
NCOR1P1,4
MLLT10P1,4


8 and 6 look interesting... let's see who has them:

In [78]:
dfs = df.sort_values(by='count', ascending=False).head(6)
for index, count in dfs.iterrows():
    nlines = count[0]
    gene = index
    fid = open(fname)
    for line in fid:
        if line.find(' ' + gene + ' ') >= 0:
            print line.rstrip()
            for i in range(nlines + 1):
                line = fid.next()
                print line.rstrip()
    fid.close()

RANGE (+/- 0kb )  [ 21 10906186 10990943 TPTE ]
    FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     OLAP   OLAP_U   OLAP_R
   1976   CLIA_400204        2   21      9483384     10906820    DUP     1423 0.0004461 0.0004212 0.007492
   1895   CLIA_400132        2   21      9483384     10941827    DEL     1458  0.02444  0.02364   0.4205
  10164   CLIA_400135        1   21      9909024     10942903    DUP     1034  0.03551  0.03394   0.4332
   1895   CLIA_400158        1   21      9966251     10942903    DEL    976.7   0.0376  0.03583   0.4332
  10131   CLIA_400142        1   21     10212768     14424017    DEL     4211  0.02013  0.02013        1
  10215   CLIA_400191        1   21     10614874     14585720    DEL     3971  0.02135  0.02135        1
  10033   CLIA_400166        1   21     10862542     14486116    DEL     3624  0.02339  0.02339        1
  10131   CLIA_400140        2   21     10944557     14421260    DEL     3477  0.01334   0.0132   0.5473
RANGE

Nothing here either... some controls have almost all genes diruspted... Granted, these are big CNVs, but still.

# TODO

* Try other alignment method?
* Go back and take a second look at XHMM results using 84 samples.
* Try cnvkit again
* Try CNV plugins for genomestudio