Had to re-run the simplex sample to include only families with two or more kids. Similar to the xhmm_again note from today.

# Python inits

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
%matplotlib inline

In [3]:
def classify_cnvs(cnvs, trios):
    """Split every child's CNVs into de novo and inherited calls.

    Parameters
    ----------
    cnvs : dict
        Maps a sample ID to a list of CNV records. Each record is a sequence
        whose first four items are chromosome (e.g. 'chr1'), start, end and
        type ('dup' or 'del'). Start and end may be ints or numeric strings.
    trios : list of dict
        Each dict has the keys 'child', 'father' and 'mother' (sample IDs
        that must be present in ``cnvs``).

    Returns
    -------
    (denovo, inherited) : tuple of dict
        Both map each child ID to the list of its CNV records in that class.
        Only autosomes chr1-chr22 are considered.

    A child CNV is inherited when its start or its end falls inside a CNV of
    the same type, on the same chromosome, in either parent.
    """
    inherited = {}
    denovo = {}
    for trio in trios:
        inherited[trio['child']] = []
        denovo[trio['child']] = []

    def _inside(position, parent_cnv):
        # Coordinates arrive as strings from the call files; compare them
        # numerically, not lexicographically.
        return int(parent_cnv[1]) <= position <= int(parent_cnv[2])

    # need to filter in chr and type first because ranges repeat for every chr,
    # and inheritance is only true for the same type of CNV
    for chrom in ['chr%d' % d for d in range(1, 23)]:
        for cnvtype in ['dup', 'del']:
            for trio in trios:
                kid = [cnv for cnv in cnvs[trio['child']]
                       if cnv[0] == chrom and cnv[3] == cnvtype]
                parents = [cnv for cnv in (cnvs[trio['father']] +
                                           cnvs[trio['mother']])
                           if cnv[0] == chrom and cnv[3] == cnvtype]
                for cnv in kid:
                    start, end = int(cnv[1]), int(cnv[2])
                    # if CNV start is not within any parents boundary AND
                    #         CNV end is not within any parents boundary
                    # it's denovo
                    # NOTE(review): a child CNV that completely spans a
                    # smaller parental CNV is still called de novo by this
                    # rule - confirm that is intended.
                    is_inherited = any(_inside(start, pcnv) or
                                       _inside(end, pcnv)
                                       for pcnv in parents)
                    if is_inherited:
                        inherited[trio['child']].append(cnv)
                    else:
                        denovo[trio['child']].append(cnv)
    return denovo, inherited


def write2plink(data, fname, trios):
    """Write CNV calls to a tab-separated PLINK CNV list file.

    Parameters
    ----------
    data : dict
        Maps a child sample ID to a list of CNV records laid out as
        [chr, start, end, type] or [chr, start, end, type, score, site].
    fname : str
        Path of the file to create (overwritten if it exists).
    trios : list of dict
        Each dict has at least the keys 'child' and 'famid'; used to look up
        the family ID of every sample in ``data``.

    Raises
    ------
    IndexError
        If a sample in ``data`` is not the child of any trio (same exception
        type as the original list-indexing lookup).

    'del' is written as type 1 and anything else as type 3. Records without
    score/site fields get '0' for both.
    """
    # First matching trio wins, as in the original list lookup.
    famid_of = {}
    for trio in trios:
        famid_of.setdefault(trio['child'], trio['famid'])

    with open(fname, 'w') as fid:
        fid.write('FID\tIID\tCHR\tBP1\tBP2\tTYPE\tSCORE\tSITE\n')
        for sample, cnvs in data.items():
            if sample not in famid_of:
                raise IndexError('sample %s is not a child in any trio' %
                                 sample)
            famid = famid_of[sample]
            for cnv in cnvs:
                ch = cnv[0].replace('chr', '')
                cnvt = '1' if cnv[3] == 'del' else '3'
                if len(cnv) > 5:
                    # The parsing cells store records as
                    # [..., type, score, site]: score is index 4, site is 5.
                    score, site = cnv[4], cnv[5]
                else:
                    score, site = '0', '0'
                fields = [famid, sample, ch, cnv[1], cnv[2], cnvt, score, site]
                fid.write('\t'.join(str(field) for field in fields) + '\n')

In [4]:
import glob
ped_file = '/data/sudregp/cnv/simplex.ped'
# NOTE(review): the filter below compares the first 4 characters of each ID
# against this list, so the 3-character 'WPS' entry can never match. Confirm
# whether WPS samples were meant to be included before changing it.
wes_prefix = ['CLIA', 'CCGO', 'WPS']

# cleaning up the data for this analysis: sample IDs listed in these files
# are dropped as children
exclude_list = []
for exclude_fname in ['/home/sudregp/data/cnv/exclude.txt',
                      '/home/sudregp/data/cnv/exclude_multikids.txt']:
    with open(exclude_fname, 'r') as fid:
        exclude_list += [line.rstrip() for line in fid]
excluded = set(exclude_list)  # constant-time membership test in the loop

trios = []      # one dict per kept child: child / father / mother / famid
affected = []   # children whose affection column is '1'
controls = []   # all other kept children
samples = []
famids = []
with open(ped_file, 'r') as fid:
    for line in fid:
        if not line.strip():
            # tolerate blank lines instead of failing on the unpack below
            continue
        famid, sid, fa, mo, sex, aff = line.rstrip().split('\t')
        # keep only trios where child and both parents carry a WES prefix
        if (fa[:4] in wes_prefix and mo[:4] in wes_prefix and
                sid[:4] in wes_prefix and sid not in excluded):
            fam = {}
            fam['child'] = sid
            if aff == '1':
                affected.append(sid)
            else:
                controls.append(sid)
            fam['father'] = fa
            fam['mother'] = mo
            fam['famid'] = famid
            trios.append(fam)
            samples += [sid, fa, mo]
            famids.append(famid)
samples = set(samples)
famids = set(famids)

# fid = open('/data/sudregp/cnv/kid_samples.txt', 'r')
# good_kids = [line.rstrip() for line in fid]
# fid.close()

# single parenthesised argument prints identically under Python 2 and 3
print('Working with %d samples, in %d families' % (len(samples), len(famids)))

Working with 88 samples, in 20 families


# ExomeCopy

In [None]:
%%R

# Count reads per capture target for every BAM, add the covariates used by
# exomeCopy, and flag sample/chromosome pairs that are almost entirely empty.
library(exomeCopy)
target.file <- "/data/NCR_SBRB/simplex/SeqCapEZ_Exome_v3.0_Design_Annotation_files/SeqCap_EZ_Exome_v3_hg19_capture_targets.bed"
setwd('~/data/cnv/BAM')
# escape the dot so that only names literally ending in ".bam" are listed
bam.files <- list.files(pattern="\\.bam$")
# BAMs left out of this run
excluded.bams <- c('CLIA_400165.bam', 'CLIA_400164.bam', 'CLIA_400155.bam',
                   'CLIA_400146.bam', 'CLIA_400145.bam', 'CLIA_400126.bam',
                   'CLIA_400207.bam', 'CLIA_400208.bam', 'CLIA_400168.bam',
                   'CLIA_400169.bam', 'CLIA_400209.bam')
bam.files = bam.files[!(bam.files %in% excluded.bams)]
print(sprintf('Processing %d samples.', length(bam.files)))
sample.names = bam.files
reference.file <- "/fdb/igenomes/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/genome.fa"
target.df <- read.delim(target.file, header = FALSE)
# BED starts are 0-based, hence the + 1 when building the ranges
target <- GRanges(seqname = target.df[, 1], IRanges(start = target.df[, 2] + 1, end = target.df[, 3]))
counts <- target
# seq_along is safe even if no BAM file is left after the exclusions
for (i in seq_along(bam.files)) {
    print(i)
    mcols(counts)[[sample.names[i]]] <- countBamInGRanges(bam.files[i], target)
}
counts$GC <- getGCcontent(target, reference.file)
counts$GC.sq <- counts$GC^2
counts$bg <- generateBackground(sample.names, counts, median)
counts$log.bg <- log(counts$bg + 0.1)
counts$width <- width(counts)
# autosomes only; this vector is also read by the fitting code further down
seqnames = sprintf('chr%d', 1:22)

# QC: report samples where more than 90% of the targets on a chromosome
# have zero reads
for (seqname in seqnames) {
    df = as.data.frame(counts[seqnames(counts) == seqname])
    for (sample in sample.names) {
        nzeros = sum(df[sample] == 0)
        pct0 = nzeros/nrow(df)
        if (pct0 > .9) {
            print(sprintf('%s in %s looks bad', sample, seqname))
        }
    }
}
    
runExomeCopy <- function(sample.name, seqs, counts) {
    # Fit exomeCopy for one sample on each sequence in `seqs`.
    # Returns a list with one fit per element of `seqs`, in the same order.
    # The package is loaded here because every worker process needs its own
    # copy of exomeCopy (and of `counts`).
    library(exomeCopy)
    covariates <- c("log.bg","GC", "GC.sq", "width")
    fit.chromosome <- function(chrom) {
        on.chrom <- counts[seqnames(counts) == chrom]
        exomeCopy(on.chrom, sample.name, X.names = covariates,
                  S = 0:4, d = 2)
    }
    lapply(seqs, fit.chromosome)
}
           
# Fit every sample in parallel, then keep only segments whose copy count
# differs from 2 and write them out.
ncpus=10
library(parallel)
cl <- makeCluster(ncpus)
# stop the workers even when a fit fails, so no R processes are left behind
fit.list = tryCatch(parLapply(cl, sample.names, runExomeCopy, seqnames, counts),
                    finally = stopCluster(cl))
res = compileCopyCountSegments(fit.list)
a = as.data.frame(res)
a = a[a$copy.count!=2,]
write.csv(a, file='~/data/cnv/exomeCopy_multikids/calls.txt', row.names=F)


In [17]:
tool = 'exomeCopy'

# one (possibly empty) list of CNV records per sample in the trios
cnvs = {}
for sample in samples:
    cnvs[sample] = []
with open('/data/sudregp/cnv/%s_multikids/calls.txt' % tool, 'r') as fid:
    for line in fid:
        # the header is the only line containing the column name 'seqnames'
        if line.find('seqnames') < 0:
            clean_line = line.replace('"', '').rstrip()
            (chrom, start, end, site, t1, cnvtype, score,
             t2, t3, sample) = clean_line.split(',')
            # the R cell already dropped copy count 2, so anything that is
            # not above 2 is a loss
            if int(cnvtype) > 2:
                cnvtype = 'dup'
            else:
                cnvtype = 'del'
            # sample names in the calls file carry the '.bam' suffix
            cnvs[sample.replace('.bam', '')].append(
                [chrom, start, end, cnvtype, score, site])

dn, inh = classify_cnvs(cnvs, trios)
# exomeCopy generated too many inherited CNVs. All three sets are still
# written so PLINK can be run on each of them.
write2plink(dn, '/data/sudregp/cnv/%s_multikids/denovo.cnv' % tool, trios)
write2plink(inh, '/data/sudregp/cnv/%s_multikids/inherited.cnv' % tool, trios)
# keep only the children; build the lookup set once instead of
# concatenating the two lists on every iteration
kids = set(affected + controls)
tmp_cnv = {}
for sample in cnvs:
    if sample in kids:
        tmp_cnv[sample] = cnvs[sample]
write2plink(tmp_cnv, '/data/sudregp/cnv/%s_multikids/all.cnv' % tool, trios)

# cn.mops

In [None]:
%%R

# Run cn.mops (exome mode) on every BAM and export segmentation, CNVs and
# CNV regions as CSV files.
library(cn.mops)
setwd('~/data/cnv/BAM')
# escape the dot so that only names literally ending in ".bam" are listed
bam.files <- list.files(pattern="\\.bam$")
# BAMs left out of this run
excluded.bams <- c('CLIA_400165.bam', 'CLIA_400164.bam', 'CLIA_400155.bam',
                   'CLIA_400146.bam', 'CLIA_400145.bam', 'CLIA_400126.bam',
                   'CLIA_400207.bam', 'CLIA_400208.bam', 'CLIA_400168.bam',
                   'CLIA_400169.bam', 'CLIA_400209.bam')
bam.files = bam.files[!(bam.files %in% excluded.bams)]
print(sprintf('Processing %d samples.', length(bam.files)))
BAMFiles = bam.files
segments <- read.table("/data/NCR_SBRB/simplex/SeqCapEZ_Exome_v3.0_Design_Annotation_files/SeqCap_EZ_Exome_v3_hg19_capture_targets.bed",sep="\t",as.is=TRUE)
# BED starts are 0-based while GRanges starts are 1-based, so shift by one
# (same convention as the exomeCopy cell in this notebook)
gr <- GRanges(segments[,1],IRanges(segments[,2] + 1,segments[,3]))
X <- getSegmentReadCountsFromBAM(BAMFiles,GR=gr)
resCNMOPS <- exomecn.mops(X)
resCNMOPS <- calcIntegerCopyNumbers(resCNMOPS)
segm <- as.data.frame(segmentation(resCNMOPS))
CNVs <- as.data.frame(cnvs(resCNMOPS))
CNVRegions <- as.data.frame(cnvr(resCNMOPS))
write.csv(segm,file="/home/sudregp/data/cnv/cn.mops_multikids/segmentation.csv", row.names=F)
write.csv(CNVs,file="/home/sudregp/data/cnv/cn.mops_multikids/cnvs.csv", row.names=F)
write.csv(CNVRegions,file="/home/sudregp/data/cnv/cn.mops_multikids/cnvr.csv", row.names=F)


In [6]:
tool = 'cn.mops'

# one (possibly empty) list of CNV records per sample in the trios
cnvs = {}
for sample in samples:
    cnvs[sample] = []
with open('/data/sudregp/cnv/%s_multikids/cnvs.csv' % tool, 'r') as fid:
    for line in fid:
        # the header is the only line containing the column name 'seqnames'
        if line.find('seqnames') < 0:
            clean_line = line.replace('"', '').rstrip()
            (chrom, start, end, site, t1, sample,
             t2, t3, cn_label) = clean_line.split(',')
            # The last column is expected to look like 'CN0', 'CN1', 'CN3',
            # ... Parse the number instead of searching for digits:
            # str.find() returns -1 (truthy) when nothing is found, so the
            # previous test labelled every call 'dup', and a substring test
            # would also misread labels such as 'CN16'.
            copy_number = int(cn_label.replace('CN', ''))
            if copy_number < 2:
                cnvtype = 'del'
            else:
                cnvtype = 'dup'
            # cn.mops reports no score here, so '0' is stored in its place
            cnvs[sample.split('.')[0]].append(
                [chrom, start, end, cnvtype, '0', site])

dn, inh = classify_cnvs(cnvs, trios)
write2plink(dn, '/data/sudregp/cnv/%s_multikids/denovo.cnv' % tool, trios)
write2plink(inh, '/data/sudregp/cnv/%s_multikids/inherited.cnv' % tool, trios)
# keep only the children; build the lookup set once instead of
# concatenating the two lists on every iteration
kids = set(affected + controls)
tmp_cnv = {}
for sample in cnvs:
    if sample in kids:
        tmp_cnv[sample] = cnvs[sample]
write2plink(tmp_cnv, '/data/sudregp/cnv/%s_multikids/all.cnv' % tool, trios)

# Conifer

In [None]:
%%bash

# stop if the working directory is missing; otherwise the copy and the rm
# below would run in whatever directory the cell started in
cd ~/data/cnv/conifer_multikids || exit 1
cp -r ../conifer/RPKM .

module load python
source activate my_py2.7
# drop the RPKM files of every excluded sample
while read -r s || [ -n "$s" ]; do
    # skip blank lines so the glob cannot degrade to RPKM/.bam*
    [ -n "$s" ] || continue
    rm -- "RPKM/${s}".bam*;
done < ../exclude_multikids.txt  # already removed the other 2 samples
ls -1 RPKM | wc -l

python /data/NCR_SBRB/software/conifer_v0.2.2/conifer.py analyze \
    --probes /data/NCR_SBRB/simplex/SeqCapEZ_Exome_v3.0_Design_Annotation_files/SeqCap_EZ_Exome_v3_hg19_capture_targets.bed \
    --rpkm_dir ./RPKM/ --output analysis.hdf5 --svd 6 \
    --write_svals singular_values.txt --plot_scree screeplot.png \
    --write_sd sd_values.txt
python /data/NCR_SBRB/software/conifer_v0.2.2/conifer.py call \
    --input analysis.hdf5 --output calls.txt


In [6]:
data_dir = '/data/sudregp/cnv/'
# one (possibly empty) list of CNV records per sample in the trios
cnvs = {}
for sample in samples:
    cnvs[sample] = []
with open(data_dir + '/conifer_multikids/calls.txt', 'r') as fid:
    for line in fid:
        # the header is the only line containing the column name 'sampleID'
        if line.find('sampleID') < 0:
            sample, chrom, start, end, cnvtype = line.rstrip().split('\t')
            # keep only the part of the sample name before the first dot
            cnvs[sample.split('.')[0]].append([chrom, start, end, cnvtype])

dn, inh = classify_cnvs(cnvs, trios)
write2plink(dn, data_dir + '/conifer_multikids/denovo.cnv', trios)
write2plink(inh, data_dir + '/conifer_multikids/inherited.cnv', trios)
# keep only the children; build the lookup set once instead of
# concatenating the two lists on every iteration
kids = set(affected + controls)
tmp_cnv = {}
for sample in cnvs:
    if sample in kids:
        tmp_cnv[sample] = cnvs[sample]
write2plink(tmp_cnv, data_dir + '/conifer_multikids/all.cnv', trios)

# PLINK

Now it's just a matter of running PLINK on all files. Just change the `tool` variable below and don't worry about the output of the cell.

In [18]:
%%bash

module load plink/1.07
tool=exomeCopy
# stop if the directory is missing; otherwise every file below would be
# created in whatever directory the cell started in
cd "/data/sudregp/cnv/${tool}_multikids/" || exit 1
cat ../penncnv/wellknown_region_hg19 > bad_regions.list
cp ../penncnv/genes.txt ../penncnv/hg19_allenBrainGene_trimmed.txt ../penncnv/glist-hg19 .

# Run PLINK on the CNV list currently named by $cnvname (its map is expected
# at ${cnvname}.map) with the options shared by every analysis below; extra
# options are passed through unchanged.
run_plink() {
    plink --map "${cnvname}.map" --fam ../simplex.ped --cnv-list "$cnvname" \
        --noweb --1 "$@"
}

# exomeCopy generated too many inherited CNVs, but all three sets are run.
for cnvtype in denovo inherited all; do
    cnvname=${cnvtype}.cnv
    plink --cnv-list "$cnvname" --cnv-make-map --noweb --out "${cnvtype}";

    # remove bad regions
    run_plink --cnv-exclude bad_regions.list --cnv-overlap .5 \
        --cnv-write --out "${cnvtype}_clean"
    plink --cnv-list "${cnvtype}_clean.cnv" --cnv-make-map --noweb --1 \
        --out "${cnvtype}_clean"

    # same analyses on the raw list ('') and on the filtered one ('_clean')
    for qc in '' '_clean'; do
        cnvname=${cnvtype}${qc}.cnv
        out=${cnvtype}${qc}
        # whole burden
        run_plink --cnv-check-no-overlap --out "${out}_burden";
        # gene sets
        run_plink --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset genes.txt --out "${out}_genes";
        run_plink --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset hg19_allenBrainGene_trimmed.txt \
            --out "${out}_brainGenes";
        # subtypes only
        for sub in del dup; do
            run_plink "--cnv-${sub}" --out "${out}_${sub}Burden";
            # gene sets
            run_plink --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset genes.txt "--cnv-${sub}" \
                --out "${out}_${sub}Genes";
            run_plink --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset hg19_allenBrainGene_trimmed.txt "--cnv-${sub}" \
                --out "${out}_${sub}BrainGenes";
        done;
    done;
done

Process is interrupted.
