This might take a while to run, as we're adding in the multiplex to the simplex cohort... Note that most of the code I'll be running in separate terminals, because they take too long to run.

# Python inits

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
%matplotlib inline

In [7]:
def classify_cnvs(cnvs, trios):
    inherited = {}
    denovo = {}
    for trio in trios:
        inherited[trio['child']] = []
        denovo[trio['child']] = []

    # need to filter in chr and type first because ranges repeat for every chr,
    # and inheritance is only true for the same type of CNV
    for chr in ['chr%d' % d for d in range(1, 23)]:
        for cnvtype in ['dup', 'del']:
            for trio in trios:
                kid = [cnv for cnv in cnvs[trio['child']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                dad = [cnv for cnv in cnvs[trio['father']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                mom = [cnv for cnv in cnvs[trio['mother']] if (cnv[0] == chr and
                                                              cnv[3] == cnvtype)]
                for cnv in kid:
                    is_inherited = False
                    # if CNV start is not within any parents boundary AND
                    #         CNV end is not within any parents boundary
                    # it's denovo
                    for pcnv in dad + mom:
                        if ((cnv[1] >= pcnv[1] or cnv[1] <= pcnv[2]) or
                            (cnv[2] >= pcnv[1] or cnv[2] <= pcnv[2])):
                            is_inherited = True
                    if is_inherited:
                        inherited[trio['child']].append(cnv)
                    else:
                        denovo[trio['child']].append(cnv)
    return denovo, inherited


def write2plink(data, fname, trios):
    fid = open(fname, 'w')
    fid.write('FID\tIID\tCHR\tBP1\tBP2\tTYPE\tSCORE\tSITE\n')
    for sample, cnvs in data.iteritems():
        famid = [trio['famid'] for trio in trios if trio['child'] == sample][0]
        for cnv in cnvs:
            ch = cnv[0].replace('chr', '')
            if cnv[3] == 'del':
                cnvt = '1'
            else:
                cnvt = '3'
            if len(cnv) > 4:
                score = cnv[5]
                site = cnv[4]
            else:
                score, site = ['0', '0']
            fid.write('\t'.join([famid, sample, ch, cnv[1], cnv[2], cnvt,
                                 score, site]) + '\n')
    fid.close()

In [16]:
# figure out who is who in each trio
import glob
ped_files = ['/data/sudregp/multiplex_simplex/simplex.ped',
             '/data/sudregp/multiplex_simplex/multiplex.ped']
wes_prefix = ['CLIA', 'CCGO', 'WPS']
trios = []
affected = []
controls = []
samples = []
famids = []
for ped_file in ped_files:
    fid = open(ped_file, 'r')
    for line in fid:
        famid, sid, fa, mo, sex, aff = line.rstrip().split('\t')
        # if the current ID and its parents have WES data, and the sample is 
        # not in yet
        if (fa.split('_')[0] in wes_prefix and
            mo.split('_')[0] in wes_prefix and
            sid.split('_')[0] in wes_prefix and sid not in samples):
            fam = {}
            fam['child'] = sid
            if aff == '1':
                affected.append(sid)
            else:
                controls.append(sid)
            fam['father'] = fa
            fam['mother'] = mo
            fam['famid'] = famid
            trios.append(fam)
            samples += [sid, fa, mo]
            famids.append(famid)
    fid.close()
samples = set(samples)
famids = set(famids)
kids = set(affected + controls)

print 'Unique samples:', len(samples)
print 'Unique families:', len(famids)
print 'Unique children:', len(kids)

Unique samples: 246
Unique families: 43
Unique children: 129


So, 246 is different than the 250 we're currently seeing in samples_unique.txt. Why the discrepancy?

In [17]:
fid = open('/home/sudregp/data/multiplex_simplex/samples_unique.txt', 'r')
s250 = [l.rstrip() for l in fid]
fid.close()
for s in s250:
    if s not in samples:
        print s

CCGO_800809
CCGO_800940
CLIA_400168
CLIA_400169


OK, these are cases of kids and parents without the other parent. Like when we analyzed the simplex only, let's leave them out of the analysis.

# XHMM

In [18]:
%%bash
cd ~/data/multiplex_simplex
rm swarm.xhmm
while read s; do 
    echo "bash ~/data/multiplex_simplex/xhmm_get_DOC.sh $s" >> swarm.xhmm;
done < samples_unique.txt
wc -l swarm.xhmm

250 swarm.xhmm


In [3]:
%%bash
cd ~/data/multiplex_simplex
swarm -f swarm.xhmm -t 4 -g 60 --job-name xhmm_DOC --logdir trash \
    --time=48:00:00

58593164


In [None]:
%%bash

exome_targets='/data/NCR_SBRB/simplex/SeqCapEZ_Exome_v3.0_Design_Annotation_files/SeqCap_EZ_Exome_v3_hg19_capture_targets.bed'
gatk_memory="50g"
ref_fa='/fdb/GATK_resource_bundle/hg19-2.8/ucsc.hg19.fasta'
out_dir='/data/sudregp/multiplex_simplex/xhmm'

cd $out_dir
module load GATK
module load XHMM

GATK -m ${gatk_memory} GCContentByInterval -L ${exome_targets} -R ${ref_fa} -o ./DATA.locus_GC.txt
cat ./DATA.locus_GC.txt | awk '{if ($2 < 0.1 || $2 > 0.9) print $1}' > ./extreme_gc_targets.txt

# merging all subjects in the directory
ls -1 *.sample_interval_summary > depth_list.txt;
grep -v -f ../exclude.txt depth_list.txt > depth_list2.txt

xhmm --mergeGATKdepths --GATKdepthsList=depth_list2.txt -o ./DATA.RD.txt;
ra
/usr/local/apps/XHMM/2016-01-04/sources/scripts/interval_list_to_pseq_reg ${exome_targets} > ./EXOME.targets.reg

xhmm --matrix -r ./DATA.RD.txt --centerData --centerType target \
-o ./DATA.filtered_centered.RD.txt \
--outputExcludedTargets ./DATA.filtered_cenratered.RD.txt.filtered_targets.txt \
--outputExcludedSamples ./DATA.filtered_centered.RD.txt.filtered_samples.txt \
--minTargetSize 10 --maxTargetSize 10000 \
--minMeanTargetRD 10 --maxMeanTargetRD 500 \
--minMeanSampleRD 25 --maxMeanSampleRD 200 \
--maxSdSampleRD 150

xhmm --PCA -r ./DATA.filtered_centered.RD.txt --PCAfiles ./DATA.RD_PCA

xhmm --normalize -r ./DATA.filtered_centered.RD.txt --PCAfiles ./DATA.RD_PCA \
--normalizeOutput ./DATA.PCA_normalized.txt \
--PCnormalizeMethod PVE_mean --PVE_mean_factor 0.7

xhmm --matrix -r ./DATA.PCA_normalized.txt --centerData --centerType sample --zScoreData \
-o ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt \
--outputExcludedTargets ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt.filtered_targets.txt \
--outputExcludedSamples ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt.filtered_samples.txt \
--maxSdTargetRD 30

xhmm --matrix -r ./DATA.RD.txt \
--excludeTargets ./DATA.filtered_centered.RD.txt.filtered_targets.txt \
--excludeTargets ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt.filtered_targets.txt \
--excludeSamples ./DATA.filtered_centered.RD.txt.filtered_samples.txt \
--excludeSamples ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt.filtered_samples.txt \
-o ./DATA.same_filtered.RD.txt

xhmm --discover -p params.txt -r ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt \
    -R ./DATA.same_filtered.RD.txt -c ./DATA.xcnv -a ./DATA.aux_xcnv -s ./DATA

xhmm --genotype -p params.txt -r ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt \
    -R ./DATA.same_filtered.RD.txt -g ./DATA.xcnv -F $ref_fa -v ./DATA.vcf



# ExomeCopy

In [None]:
%%R

library(exomeCopy)
target.file <- "/data/NCR_SBRB/simplex/SeqCapEZ_Exome_v3.0_Design_Annotation_files/SeqCap_EZ_Exome_v3_hg19_capture_targets.bed"
setwd('~/data/multiplex_simplex/BAM')
bam.files <- list.files(pattern=".bam$")
keep_me = bam.files != 'CLIA_400168.bam' & bam.files != 'CLIA_400169.bam' & bam.files != 'CCGO_800809.bam' & bam.files != 'CCGO_800940.bam'
bam.files = bam.files[keep_me]
sample.names = bam.files
reference.file <- "/fdb/igenomes/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/genome.fa"
target.df <- read.delim(target.file, header = FALSE)
target <- GRanges(seqname = target.df[, 1], IRanges(start = target.df[, 2] + 1, end = target.df[, 3]))
counts <- target
for (i in 1:length(bam.files)) {
    print(i)
    mcols(counts)[[sample.names[i]]] <- countBamInGRanges(bam.files[i], target)
}
counts$GC <- getGCcontent(target, reference.file)
counts$GC.sq <- counts$GC^2
counts$bg <- generateBackground(sample.names, counts, median)
counts$log.bg <- log(counts$bg + 0.1)
counts$width <- width(counts)
seqnames = sapply(1:22, function(d) sprintf('chr%d', d))

for (seqname in seqnames) {
    df = as.data.frame(counts[seqnames(counts) == seqname])
    for (sample in sample.names) {
        nzeros = sum(df[sample] == 0)
        pct0 = nzeros/nrow(df)
        if (pct0 > .9) {
            print(sprintf('%s in %s looks bad', sample, seqname))
        }
    }
}
    
runExomeCopy <- function(sample.name, seqs, counts) {
    library(exomeCopy) # each cpu needs to load ExomeCopy and have a copy of counts
    lapply(seqs, function(seq.name) exomeCopy(counts[seqnames(counts) == seq.name], sample.name,
                                              X.names = c("log.bg","GC", "GC.sq", "width"),
                                              S = 0:4, d = 2))
}
           
ncpus=10
library(parallel)
cl <- makeCluster(ncpus)
fit.list = parLapply(cl, sample.names, runExomeCopy, seqnames, counts)
stopCluster(cl)
res = compileCopyCountSegments(fit.list)
a = as.data.frame(res)
a = a[a$copy.count!=2,]
write.csv(a, file='~/data/multiplex_simplex/exomeCopy/calls.txt', row.names=F)


# cn.mops

In [None]:
%%R

library(cn.mops)
setwd('~/data/multiplex_simplex/BAM')
BAMFiles <- list.files(pattern=".bam$")
keep_me = BAMFiles != 'CLIA_400168.bam' & BAMFiles != 'CLIA_400169.bam' & BAMFiles != 'CCGO_800809.bam' & BAMFiles != 'CCGO_800940.bam'
BAMFiles = BAMFiles[keep_me]
segments <- read.table("/data/NCR_SBRB/simplex/SeqCapEZ_Exome_v3.0_Design_Annotation_files/SeqCap_EZ_Exome_v3_hg19_capture_targets.bed",sep="\t",as.is=TRUE)
gr <- GRanges(segments[,1],IRanges(segments[,2],segments[,3]))
X <- getSegmentReadCountsFromBAM(BAMFiles,GR=gr)
resCNMOPS <- exomecn.mops(X)
resCNMOPS <- calcIntegerCopyNumbers(resCNMOPS)
segm <- as.data.frame(segmentation(resCNMOPS))
CNVs <- as.data.frame(cnvs(resCNMOPS))
CNVRegions <- as.data.frame(cnvr(resCNMOPS))
write.csv(segm,file="/home/sudregp/data/multiplex_simplex/cn.mops/segmentation.csv", row.names=F)
write.csv(CNVs,file="/home/sudregp/data/multiplex_simplex/cn.mops/cnvs.csv", row.names=F)
write.csv(CNVRegions,file="/home/sudregp/data/multiplex_simplex/cn.mops/cnvr.csv", row.names=F)


# Conifer

In [5]:
%%bash

cd ~/data/multiplex_simplex/conifer
rm swarm*
cd ../BAM/
for i in `/bin/ls -1 *.bam`; do 
    echo "python /data/NCR_SBRB/software/conifer_v0.2.2/conifer.py rpkm --probes /data/NCR_SBRB/simplex/SeqCapEZ_Exome_v3.0_Design_Annotation_files/SeqCap_EZ_Exome_v3_hg19_capture_targets.bed --input /data/sudregp/multiplex_simplex/BAM/${i} --output /data/sudregp/multiplex_simplex/conifer/RPKM/${i}.rpkm.txt" >> ../conifer/swarm.rpkm;
done
cd ../conifer/
swarm -f swarm.rpkm --logdir trash --job-name rpkm --partition quick

58595262


In [None]:
%%bash

cd ~/data/multiplex_simplex/conifer
module load python
source activate my_py2.7
rm RPKM/CLIA_400168.bam* RPKM/CLIA_400169.bam* RPKM/CCGO_800940.bam* RPKM/CCGO_800809.bam*
ls -1 RPKM | wc -l
python /data/NCR_SBRB/software/conifer_v0.2.2/conifer.py analyze \
    --probes /data/NCR_SBRB/simplex/SeqCapEZ_Exome_v3.0_Design_Annotation_files/SeqCap_EZ_Exome_v3_hg19_capture_targets.bed \
    --rpkm_dir ./RPKM/ --output analysis.hdf5 --svd 6 \
    --write_svals singular_values.txt --plot_scree screeplot.png \
    --write_sd sd_values.txt
python /data/NCR_SBRB/software/conifer_v0.2.2/conifer.py call \
    --input analysis.hdf5 --output calls.txt


In [19]:
data_dir = '/data/sudregp/multiplex_simplex/'
fid = open(data_dir + '/conifer/calls.txt', 'r')
cnvs = {}
for sample in samples:
    cnvs[sample] = []
for line in fid:
    if line.find('sampleID') < 0:
        sample, chr, start, end, cnvtype = line.rstrip().split('\t')
        cnvs[sample.split('.')[0]].append([chr, start, end, cnvtype])
fid.close()

dn, inh = classify_cnvs(cnvs, trios)
write2plink(dn, data_dir + '/conifer/denovo.cnv', trios)
write2plink(inh, data_dir + '/conifer/inherited.cnv', trios)
tmp_cnv = {}
for sample in cnvs.iterkeys():
    if sample in affected + controls:
        tmp_cnv[sample] = cnvs[sample]
write2plink(tmp_cnv, data_dir + '/conifer/all.cnv', trios)

# TODO

* Check if all samples are good (ExomeCopy pipeline, SVD plots from conifer, XHMM output) and re-run everything if not

# Trash

Let's make sure no sample was removed by XHMM this time:

In [2]:
cd ~/data/cnv/xhmm
ls -ltr DATA.filtered_centered.RD.txt.filtered_samples.txt

-rw-rw---- 1 sudregp sudregp 0 Dec 28 16:35 DATA.filtered_centered.RD.txt.filtered_samples.txt


Let's see what we can get in PLINK/PSEQ:

In [5]:
module load plinkseq
module load plink/1.07

pseq DATA new-project
# adding a first column with subject ID for PSEQ
cut -f 2 ../simplex.ped > junk.txt
paste junk.txt ../simplex.ped > simplex.ped.info
pseq DATA load-pedigree --file simplex.ped.info
pseq DATA load-vcf --vcf DATA.vcf
pseq DATA cnv-denovo --noweb --minSQ 60 --minNQ 60 --out DATA

[-] Unloading GSL 2.2.1 ...
[-] Unloading Graphviz v2.38.0 ...
[-] Unloading gdal 2.0 ...
[-] Unloading proj 4.9.2 ...
[-] Unloading gcc 4.9.1 ...
[-] Unloading openmpi 1.10.0 for GCC 4.9.1
[-] Unloading tcl_tk 8.6.3
[-] Unloading Zlib 1.2.8 ...
[-] Unloading Bzip2 1.0.6 ...
[-] Unloading pcre 8.38 ...
[-] Unloading liblzma 5.2.2 ...
[-] Unloading libjpeg-turbo 1.5.1 ...
[-] Unloading tiff 4.0.7 ...
[-] Unloading curl 7.46.0 ...
[-] Unloading boost libraries v1.65 ...
[-] Unloading R 3.4.0 on cn3238
[+] Loading GSL 2.2.1 ...
[+] Loading Graphviz v2.38.0 ...
[+] Loading gdal 2.0 ...
[+] Loading proj 4.9.2 ...
[+] Loading gcc 4.9.1 ...
[+] Loading openmpi 1.10.0 for GCC 4.9.1
[+] Loading tcl_tk 8.6.3
[+] Loading Zlib 1.2.8 ...
[+] Loading Bzip2 1.0.6 ...
[+] Loading pcre 8.38 ...
[+] Loading liblzma 5.2.2 ...
[-] Unloading Zlib 1.2.8 ...
[+] Loading Zlib 1.2.8 ...
[-] Unloading liblzma 5.2.2 ...
[+] Loading liblzma 5.2.2 ...
[+] Loading libjpeg-turbo 1.5.1 ...
[+] Loading tiff 4.0.7 ...


Now, just like what we did for PennCNV, let's create PLINK files for denovo, inherited, and everything. We do have this knob of quality to play with beforehand, so let's keep that in mind.

If I'm just interested in numbers, I could just grab the information from the DATA.denovo.cnv.indiv file, which looks like this:

In [6]:
head DATA.denovo.cnv.indiv

#SUMMARY	CHILD	CHILD_CNV	IN_PATERNAL	IN_MATERNAL	IN_PATERNAL_AND_MATERNAL	MISSING_PATERNAL	MISSING_MATERNAL	DE_NOVO	PATERNAL_TRANSMITTED	PATERNAL_NON_TRANSMITTED	PATERNAL_UNKNOWN	MATERNAL_TRANSMITTED	MATERNAL_NON_TRANSMITTED	MATERNAL_UNKNOWN
SUMMARY	CCGO_800979	105	18	8	1	47	13	27	17	47	17	7	80	10
SUMMARY	CCGO_800980	98	13	10	0	36	23	25	11	47	23	6	72	19
SUMMARY	CLIA_400121	104	34	23	7	40	42	8	16	27	18	8	25	17
SUMMARY	CLIA_400122	136	49	63	27	45	46	11	11	11	16	22	30	22
SUMMARY	CLIA_400123	112	10	22	1	43	46	26	2	10	6	13	40	21
SUMMARY	CLIA_400125	99	30	37	9	38	28	7	16	25	24	16	18	19
SUMMARY	CLIA_400126	115	29	35	8	30	41	20	11	42	25	19	25	21
SUMMARY	CLIA_400127	114	28	28	4	38	48	14	14	30	20	15	23	16
SUMMARY	CLIA_400128	126	39	23	5	50	47	12	24	20	19	13	14	16


But I'll likely want to do some further filtering in PLINK, so I'l need the non .indiv file.

In [8]:
cat DATA.denovo.cnv | cut -f 1 | sort | uniq

#CLASS
DENOVO
MATERNAL_NON_TRANSMITTED
MATERNAL_TRANSMITTED
MATERNAL_UNKNOWN
PATERNAL_NON_TRANSMITTED
PATERNAL_TRANSMITTED
PATERNAL_UNKNOWN


Let's format the file into something PLINK can understand. XHMM has a script to convert from xcnv to cnv, so we just need to filter all DENOVO calls from PSEQ in the main xcnv file:

In [9]:
grep DENOVO DATA.denovo.cnv > pseq_DENOVO.txt
# borrow the header row
head -1 DATA.xcnv > denovo.xcnv;

while read sample; do
    grep $sample DATA.xcnv > sample.xcnv;
    for cnv in `grep $sample pseq_DENOVO.txt | cut -f 3 -`; do
        # replacing .. by -
        cnv=`echo $cnv | sed -e 's/\.\./\-/'`;
        grep $cnv sample.xcnv >> denovo.xcnv; 
    done;
done < ../kid_samples.txt;
/usr/local/apps/XHMM/2016-01-04/sources/scripts/xcnv_to_cnv denovo.xcnv > tmp.cnv
# switch around FAMID and IID columns, and remove header
awk '{OFS="\t"; if ( $3 != "CHR" ) {print $2, $1, $3, $4, $5, $6, $7, $8 }}' tmp.cnv > denovo.cnv
rm sample.xcnv pseq_DENOVO.txt tmp.cnv denovo.xcnv

In [10]:
grep MATERNAL_TRANSMITTED DATA.denovo.cnv > pseq_TRANSMITTED.txt
grep PATERNAL_TRANSMITTED DATA.denovo.cnv >> pseq_TRANSMITTED.txt
# borrow the header row
head -1 DATA.xcnv > inherited.xcnv;

while read sample; do
    grep $sample DATA.xcnv > sample.xcnv;
    for cnv in `grep $sample pseq_TRANSMITTED.txt | cut -f 3 -`; do
        # replacing .. by -
        cnv=`echo $cnv | sed -e 's/\.\./\-/'`;
        grep $cnv sample.xcnv >> inherited.xcnv; 
    done;
done < ../kid_samples.txt;
/usr/local/apps/XHMM/2016-01-04/sources/scripts/xcnv_to_cnv inherited.xcnv > tmp.cnv
# switch around FAMID and IID columns, and remove header
awk '{OFS="\t"; if ( $3 != "CHR" ) {print $2, $1, $3, $4, $5, $6, $7, $8 }}' tmp.cnv > inherited.cnv
rm sample.xcnv pseq_TRANSMITTED.txt tmp.cnv inherited.xcnv

In [11]:
# borrow the header row
head -1 DATA.xcnv > all.xcnv;

# effectively just filtering DATA.xcnv to keep only kids
while read sample; do
    grep $sample DATA.xcnv >> all.xcnv;
done < ../kid_samples.txt;
/usr/local/apps/XHMM/2016-01-04/sources/scripts/xcnv_to_cnv all.xcnv > tmp.cnv
# switch around FAMID and IID columns, and remove header
awk '{OFS="\t"; if ( $3 != "CHR" ) {print $2, $1, $3, $4, $5, $6, $7, $8 }}' tmp.cnv > all.cnv
rm tmp.cnv all.xcnv

The issue here is that PSEQ doesn't carry forward the FAMID, so PLINK doesn't know how to add the information back to the sample IDs, which all have FAMIDs in the .ped. So, we need to create a .ped without FAMIDs, whic is fine since at this point all the family information has already been used, and PLINK doesn't need it anymore.

In [13]:
for cnvtype in all denovo inherited; do
    cnvname=${cnvtype}.cnv
    plink --cnv-list $cnvname --cnv-make-map --noweb --out ${cnvtype};
    # whole burden
    plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
        --noweb --1 --cnv-check-no-overlap --out ${cnvtype}_burden;
    # gene sets
    plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
        --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
        --cnv-subset genes.txt --out ${cnvtype}_genes;
    plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
        --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
        --cnv-subset hg19_allenBrainGene_trimmed.txt \
        --out ${cnvtype}_brainGenes;
    # subtypes only
    for sub in del dup; do
        plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
        --noweb --1 --cnv-${sub} --out ${cnvtype}_${sub}Burden;
        # gene sets
        plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset genes.txt --cnv-${sub} \
            --out ${cnvtype}_${sub}Genes;
        plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
            --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
            --cnv-subset hg19_allenBrainGene_trimmed.txt --cnv-${sub} \
            --out ${cnvtype}_${sub}BrainGenes;
    done;
done


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ all.log ]
Analysis started: Tue Jan  2 13:13:15 2018

Options in effect:
	--cnv-list all.cnv
	--cnv-make-map
	--noweb
	--out all


Reading segment list (CNVs) from [ all.cnv ]
Writing new MAP file to [ all.cnv.map ]
Wrote 10414 unique positions to file

Analysis finished: Tue Jan  2 13:13:15 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|--------------------

|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ all_delGenes.log ]
Analysis started: Tue Jan  2 13:13:23 2018

Options in effect:
	--map all.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list all.cnv
	--noweb
	--1
	--cnv-intersect glist-hg19
	--cnv-verbose-report-regions
	--cnv-subset genes.txt
	--cnv-del
	--out all_delGenes

Reading marker information from [ all.cnv.map ]
10414 (of 10414) markers to be included from [ all.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 


	--cnv-subset hg19_allenBrainGene_trimmed.txt
	--cnv-dup
	--out all_dupBrainGenes

Reading marker information from [ all.cnv.map ]
10414 (of 10414) markers to be included from [ all.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ hg19_allenBrainGene_trimmed.txt ]
Looking for subset of 606 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 577 ranges to intersect with CNV list
Could not find 29 ranges
Writing this list to [ all_dupBrainGenes.notfound ] 

Reading segment list (CNVs) from [ all.cnv ]
Writing back list to intersected regions to [ all_dupBrainGenes.reg ]
7294 mapped to a person, of which 3678 passed filters

|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ denovo_delBurden.log ]
Analysis started: Tue Jan  2 13:13:32 2018

Options in effect:
	--map denovo.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list denovo.cnv
	--noweb
	--1
	--cnv-del
	--out denovo_delBurden

Reading marker information from [ denovo.cnv.map ]
1843 (of 1843) markers to be included from [ denovo.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex

Reading segment list (CNVs) from [ denovo.cnv ]
701 mapped to a

21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ genes.txt ]
Looking for subset of 25303 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 25646 ranges to intersect with CNV list

Reading segment list (CNVs) from [ denovo.cnv ]
Writing back list to intersected regions to [ denovo_dupGenes.reg ]
701 mapped to a person, of which 293 passed filters
254 intersected with one or more specified region
254 of 701 mapped as valid segments
 CopyN Case/Control
     3    119 / 135

Writing per-individual summary to [ denovo_dupGenes.cnv.indiv ]
Writing positional summary to [ denovo_dupGenes.cnv.summary ]

Analysis finished: Tue Jan  2 13:13:33 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|---------------------

	--cnv-subset hg19_allenBrainGene_trimmed.txt
	--out inherited_brainGenes

Reading marker information from [ inherited.cnv.map ]
797 (of 797) markers to be included from [ inherited.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ hg19_allenBrainGene_trimmed.txt ]
Looking for subset of 606 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 577 ranges to intersect with CNV list
Could not find 29 ranges
Writing this list to [ inherited_brainGenes.notfound ] 

Reading segment list (CNVs) from [ inherited.cnv ]
Writing back list to intersected regions to [ inherited_brainGenes.reg ]
407 mapped to a person, of which 407 pass

21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex

Reading segment list (CNVs) from [ inherited.cnv ]
407 mapped to a person, of which 186 passed filters
186 of 407 mapped as valid segments
 CopyN Case/Control
     3      90 / 96

Writing per-individual summary to [ inherited_dupBurden.cnv.indiv ]
Writing positional summary to [ inherited_dupBurden.cnv.summary ]

Analysis finished: Tue Jan  2 13:13:35 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [

In [14]:
cd ~/data/cnv/xhmm
module load plink/1.07

cat ../penncnv/wellknown_region_hg19 > bad_regions.list

for cnvtype in all denovo inherited; do
    cnvname=${cnvtype}.cnv

    # remove bad regions
    plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
        --noweb --1 --cnv-exclude bad_regions.list --cnv-overlap .5 \
        --cnv-write --out ${cnvtype}_clean
    plink --cnv-list ${cnvtype}_clean.cnv --cnv-make-map --noweb --1 \
        --out ${cnvtype}_clean
done


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ all_clean.log ]
Analysis started: Tue Jan  2 13:14:36 2018

Options in effect:
	--map all.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list all.cnv
	--noweb
	--1
	--cnv-exclude bad_regions.list
	--cnv-overlap .5
	--cnv-write
	--out all_clean

Reading marker information from [ all.cnv.map ]
10414 (of 10414) markers to be included from [ all.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information

In [15]:
cd ~/data/cnv/xhmm

for cnvtype in all denovo inherited; do
    for qc in '' '_clean'; do
        cnvname=${cnvtype}${qc}.cnv
        # apply CNV length threshold in clean and nonclean sets
        for kb in 0 500; do
            # whole burden
            plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
                --noweb --1 --cnv-check-no-overlap --cnv-kb $kb \
                --out ${cnvtype}${qc}_lenBT${kb}_burden;
            # gene sets
            plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset genes.txt --cnv-kb $kb \
                --out ${cnvtype}${qc}_lenBT${kb}_genes;
            plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
                --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                --cnv-subset hg19_allenBrainGene_trimmed.txt --cnv-kb $kb \
                --out ${cnvtype}${qc}_lenBT${kb}_brainGenes;
            # subtypes only
            for sub in del dup; do
                plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
                --noweb --1 --cnv-${sub} --cnv-kb $kb \
                --out ${cnvtype}${qc}_lenBT${kb}_${sub}Burden;
                # gene sets
                plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
                    --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                    --cnv-subset genes.txt --cnv-${sub} --cnv-kb $kb\
                    --out ${cnvtype}${qc}_lenBT${kb}_${sub}Genes;
                plink --map ${cnvname}.map --fam ../simplex_nofamid.ped --cnv-list $cnvname \
                    --noweb --1 --cnv-intersect glist-hg19 --cnv-verbose-report-regions \
                    --cnv-subset hg19_allenBrainGene_trimmed.txt --cnv-${sub} \
                    --cnv-kb $kb \
                    --out ${cnvtype}${qc}_lenBT${kb}_${sub}BrainGenes;
            done;
        done;
    done;
done


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ all_lenBT0_burden.log ]
Analysis started: Tue Jan  2 13:16:22 2018

Options in effect:
	--map all.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list all.cnv
	--noweb
	--1
	--cnv-check-no-overlap
	--cnv-kb 0
	--out all_lenBT0_burden

Reading marker information from [ all.cnv.map ]
10414 (of 10414) markers to be included from [ all.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../

	--cnv-del
	--cnv-kb 0
	--out all_lenBT0_delGenes

Reading marker information from [ all.cnv.map ]
10414 (of 10414) markers to be included from [ all.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ genes.txt ]
Looking for subset of 25303 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 25646 ranges to intersect with CNV list

Reading segment list (CNVs) from [ all.cnv ]
Writing back list to intersected regions to [ all_lenBT0_delGenes.reg ]
7294 mapped to a person, of which 3616 passed filters
3111 intersected with one or more specified region
3111 of 7294 mapped as valid segments
 CopyN Case/Control
     1  1552 / 1

21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ hg19_allenBrainGene_trimmed.txt ]
Looking for subset of 606 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 577 ranges to intersect with CNV list
Could not find 29 ranges
Writing this list to [ all_lenBT0_dupBrainGenes.notfound ] 

Reading segment list (CNVs) from [ all.cnv ]
Writing back list to intersected regions to [ all_lenBT0_dupBrainGenes.reg ]
7294 mapped to a person, of which 3678 passed filters
35 intersected with one or more specified region
35 of 7294 mapped as valid segments
 CopyN Case/Control
     3      14 / 21

Writing per-individual summary to [ all_lenBT0_dupBrainGenes.cnv.indiv ]
Writing positional summary to [ all_lenBT0_dupBrainGenes.cnv.summary ]

Analysis finished: Tue Jan  2 13:16:39 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|---------------


Analysis finished: Tue Jan  2 13:16:40 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ all_lenBT500_delGenes.log ]
Analysis started: Tue Jan  2 13:16:40 2018

Options in effect:
	--map all.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list all.cnv
	--noweb
	--1
	--cnv-intersect glist-hg19
	--cnv-verbose-report-regions
	--cnv-subset genes.txt
	--cnv-del
	--cnv-kb 500
	--out all_lenBT500_delGenes

Reading marker information from [ all.cnv.map ]
10414 (of 10414) markers to be include

@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ all_lenBT500_dupBrainGenes.log ]
Analysis started: Tue Jan  2 13:16:42 2018

Options in effect:
	--map all.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list all.cnv
	--noweb
	--1
	--cnv-intersect glist-hg19
	--cnv-verbose-report-regions
	--cnv-subset hg19_allenBrainGene_trimmed.txt
	--cnv-dup
	--cnv-kb 500
	--out all_lenBT500_dupBrainGenes

Reading marker information from [ all.cnv.map ]
10414 (of 10414) markers to be included from [ all.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ hg19_allenBrainGen

Analysis started: Tue Jan  2 13:16:42 2018

Options in effect:
	--map all_clean.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list all_clean.cnv
	--noweb
	--1
	--cnv-del
	--cnv-kb 0
	--out all_clean_lenBT0_delBurden

Reading marker information from [ all_clean.cnv.map ]
456 (of 456) markers to be included from [ all_clean.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex

Reading segment list (CNVs) from [ all_clean.cnv ]
185 mapped to a person, of which 107 passed filters
107 of 185 mapped as valid segments
 CopyN Case/Control
     1      84 / 23

Writing per-individual summary to [ all_clean_lenBT0_delBurden.cnv.indiv ]
Writing positional summary to [ all_clean_lenBT0_d

Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ genes.txt ]
Looking for subset of 25303 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 25646 ranges to intersect with CNV list

Reading segment list (CNVs) from [ all_clean.cnv ]
Writing back list to intersected regions to [ all_clean_lenBT0_dupGenes.reg ]
185 mapped to a person, of which 78 passed filters
75 intersected with one or more specified region
75 of 185 mapped as valid segments
 CopyN Case/Control
     3      28 / 47

Writing per-individual summary to [ all_clean_lenBT0_dupGenes.cnv.indiv ]
Writing positional summary to [ all_clean_lenBT0_dupGenes.cnv.summary ]

Analysis finished: Tue Jan  2 13:16:43 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|-------------------------------------------------------

Looking for subset of 606 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 577 ranges to intersect with CNV list
Could not find 29 ranges
Writing this list to [ all_clean_lenBT500_brainGenes.notfound ] 

Reading segment list (CNVs) from [ all_clean.cnv ]
Writing back list to intersected regions to [ all_clean_lenBT500_brainGenes.reg ]
185 mapped to a person, of which 9 passed filters
2 intersected with one or more specified region
2 of 185 mapped as valid segments
 CopyN Case/Control
     1        2 / 0

Writing per-individual summary to [ all_clean_lenBT500_brainGenes.cnv.indiv ]
Writing positional summary to [ all_clean_lenBT500_brainGenes.cnv.summary ]

Analysis finished: Tue Jan  2 13:16:44 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|-------------------------------

Writing positional summary to [ all_clean_lenBT500_dupBurden.cnv.summary ]

Analysis finished: Tue Jan  2 13:16:44 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ all_clean_lenBT500_dupGenes.log ]
Analysis started: Tue Jan  2 13:16:44 2018

Options in effect:
	--map all_clean.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list all_clean.cnv
	--noweb
	--1
	--cnv-intersect glist-hg19
	--cnv-verbose-report-regions
	--cnv-subset genes.txt
	--cnv-dup
	--cnv-kb 500
	--out all_clean_lenBT5

|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ denovo_lenBT0_brainGenes.log ]
Analysis started: Tue Jan  2 13:16:45 2018

Options in effect:
	--map denovo.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list denovo.cnv
	--noweb
	--1
	--cnv-intersect glist-hg19
	--cnv-verbose-report-regions
	--cnv-subset hg19_allenBrainGene_trimmed.txt
	--cnv-kb 0
	--out denovo_lenBT0_brainGenes

Reading marker information from [ denovo.cnv.map ]
1843 (of 1843) markers to be included from [ denovo.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 i

@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ denovo_lenBT0_dupBurden.log ]
Analysis started: Tue Jan  2 13:16:46 2018

Options in effect:
	--map denovo.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list denovo.cnv
	--noweb
	--1
	--cnv-dup
	--cnv-kb 0
	--out denovo_lenBT0_dupBurden

Reading marker information from [ denovo.cnv.map ]
1843 (of 1843) markers to be included from [ denovo.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex

Reading segment list (CNVs) from [ denovo.cnv ]
701 mapped to a person, of which 293 passed filters
293 of 701 mapped as valid segments
 CopyN Case/Control
   

99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ genes.txt ]
Looking for subset of 25303 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 25646 ranges to intersect with CNV list

Reading segment list (CNVs) from [ denovo.cnv ]
Writing back list to intersected regions to [ denovo_lenBT500_genes.reg ]
701 mapped to a person, of which 2 passed filters
2 intersected with one or more specified region
2 of 701 mapped as valid segments
 CopyN Case/Control
     1        1 / 0
     3        0 / 1

Writing per-individual summary to [ denovo_lenBT500_genes.cnv.indiv ]
Writing positional summary to [ denovo_lenBT500_genes.cnv.summary ]

Analysis finished: Tue Jan  2 13:16:47 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|-----------

Read 577 ranges to intersect with CNV list
Could not find 29 ranges
Writing this list to [ denovo_lenBT500_delBrainGenes.notfound ] 

Reading segment list (CNVs) from [ denovo.cnv ]
Writing back list to intersected regions to [ denovo_lenBT500_delBrainGenes.reg ]
701 mapped to a person, of which 1 passed filters
0 intersected with one or more specified region
0 of 701 mapped as valid segments
 CopyN Case/Control

Writing per-individual summary to [ denovo_lenBT500_delBrainGenes.cnv.indiv ]
Writing positional summary to [ denovo_lenBT500_delBrainGenes.cnv.summary ]

Analysis finished: Tue Jan  2 13:16:48 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://

|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ denovo_clean_lenBT0_genes.log ]
Analysis started: Tue Jan  2 13:16:48 2018

Options in effect:
	--map denovo_clean.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list denovo_clean.cnv
	--noweb
	--1
	--cnv-intersect glist-hg19
	--cnv-verbose-report-regions
	--cnv-subset genes.txt
	--cnv-kb 0
	--out denovo_clean_lenBT0_genes

Reading marker information from [ denovo_clean.cnv.map ]
151 (of 151) markers to be included from [ denovo_clean.cnv.map ]
Reading individual information from [ ../simplex_nofamid.p

|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ denovo_clean_lenBT0_delBrainGenes.log ]
Analysis started: Tue Jan  2 13:16:49 2018

Options in effect:
	--map denovo_clean.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list denovo_clean.cnv
	--noweb
	--1
	--cnv-intersect glist-hg19
	--cnv-verbose-report-regions
	--cnv-subset hg19_allenBrainGene_trimmed.txt
	--cnv-del
	--cnv-kb 0
	--out denovo_clean_lenBT0_delBrainGenes

Reading marker information from [ denovo_clean.cnv.map ]
151 (of 151) markers to be included from [ denovo_clean.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=u

|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ denovo_clean_lenBT500_burden.log ]
Analysis started: Tue Jan  2 13:16:49 2018

Options in effect:
	--map denovo_clean.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list denovo_clean.cnv
	--noweb
	--1
	--cnv-check-no-overlap
	--cnv-kb 500
	--out denovo_clean_lenBT500_burden

Reading marker information from [ denovo_clean.cnv.map ]
151 (of 151) markers to be included from [ denovo_clean.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex

Reading segment list (CNVs) from [ denovo_clean.cnv

	--out denovo_clean_lenBT500_delGenes

Reading marker information from [ denovo_clean.cnv.map ]
151 (of 151) markers to be included from [ denovo_clean.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ genes.txt ]
Looking for subset of 25303 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 25646 ranges to intersect with CNV list

Reading segment list (CNVs) from [ denovo_clean.cnv ]
Writing back list to intersected regions to [ denovo_clean_lenBT500_delGenes.reg ]
53 mapped to a person, of which 1 passed filters
1 intersected with one or more specified region
1 of 53 mapped as valid segments
 CopyN Case/Control
     1 

151 (of 151) markers to be included from [ denovo_clean.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ hg19_allenBrainGene_trimmed.txt ]
Looking for subset of 606 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 577 ranges to intersect with CNV list
Could not find 29 ranges
Writing this list to [ denovo_clean_lenBT500_dupBrainGenes.notfound ] 

Reading segment list (CNVs) from [ denovo_clean.cnv ]
Writing back list to intersected regions to [ denovo_clean_lenBT500_dupBrainGenes.reg ]
53 mapped to a person, of which 1 passed filters
0 intersected with one or more specified region
0 of 53 mapped as valid segments
 Cop

99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex

Reading segment list (CNVs) from [ inherited.cnv ]
407 mapped to a person, of which 221 passed filters
221 of 407 mapped as valid segments
 CopyN Case/Control
     1    100 / 121

Writing per-individual summary to [ inherited_lenBT0_delBurden.cnv.indiv ]
Writing positional summary to [ inherited_lenBT0_delBurden.cnv.summary ]

Analysis finished: Tue Jan  2 13:16:51 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.m

145 intersected with one or more specified region
145 of 407 mapped as valid segments
 CopyN Case/Control
     3      71 / 74

Writing per-individual summary to [ inherited_lenBT0_dupGenes.cnv.indiv ]
Writing positional summary to [ inherited_lenBT0_dupGenes.cnv.summary ]

Analysis finished: Tue Jan  2 13:16:52 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ inherited_lenBT0_dupBrainGenes.log ]
Analysis started: Tue Jan  2 13:16:52 2018

Options in effect:
	--map inherited.cnv.map

 CopyN Case/Control

Writing per-individual summary to [ inherited_lenBT500_brainGenes.cnv.indiv ]
Writing positional summary to [ inherited_lenBT500_brainGenes.cnv.summary ]

Analysis finished: Tue Jan  2 13:16:53 2018


@----------------------------------------------------------@
|        PLINK!       |     v1.07      |   10/Aug/2009     |
|----------------------------------------------------------|
|  (C) 2009 Shaun Purcell, GNU General Public License, v2  |
|----------------------------------------------------------|
|  For documentation, citation & bug-report instructions:  |
|        http://pngu.mgh.harvard.edu/purcell/plink/        |
@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ inherited_lenBT500_delBurden.log ]
Analysis started: Tue Jan  2 13:16:53 2018

Options in effect:
	--map inherited.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list inherited.cnv
	--noweb
	--1
	--cnv-del
	--cnv-kb 500
	--o

@----------------------------------------------------------@

Skipping web check... [ --noweb ] 
Writing this text to log file [ inherited_lenBT500_dupGenes.log ]
Analysis started: Tue Jan  2 13:16:53 2018

Options in effect:
	--map inherited.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list inherited.cnv
	--noweb
	--1
	--cnv-intersect glist-hg19
	--cnv-verbose-report-regions
	--cnv-subset genes.txt
	--cnv-dup
	--cnv-kb 500
	--out inherited_lenBT500_dupGenes

Reading marker information from [ inherited.cnv.map ]
797 (of 797) markers to be included from [ inherited.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ genes.txt ]
Lookin

Analysis started: Tue Jan  2 13:16:54 2018

Options in effect:
	--map inherited_clean.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list inherited_clean.cnv
	--noweb
	--1
	--cnv-intersect glist-hg19
	--cnv-verbose-report-regions
	--cnv-subset hg19_allenBrainGene_trimmed.txt
	--cnv-kb 0
	--out inherited_clean_lenBT0_brainGenes

Reading marker information from [ inherited_clean.cnv.map ]
15 (of 15) markers to be included from [ inherited_clean.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ hg19_allenBrainGene_trimmed.txt ]
Looking for subset of 606 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 577 ranges to intersec

Analysis started: Tue Jan  2 13:16:54 2018

Options in effect:
	--map inherited_clean.cnv.map
	--fam ../simplex_nofamid.ped
	--cnv-list inherited_clean.cnv
	--noweb
	--1
	--cnv-dup
	--cnv-kb 0
	--out inherited_clean_lenBT0_dupBurden

Reading marker information from [ inherited_clean.cnv.map ]
15 (of 15) markers to be included from [ inherited_clean.cnv.map ]
Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex

Reading segment list (CNVs) from [ inherited_clean.cnv ]
5 mapped to a person, of which 1 passed filters
1 of 5 mapped as valid segments
 CopyN Case/Control
     3        1 / 0

Writing per-individual summary to [ inherited_clean_lenBT0_dupBurden.cnv.indiv ]
Writing positional

Reading individual information from [ ../simplex_nofamid.ped ]
Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ genes.txt ]
Looking for subset of 25303 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 25646 ranges to intersect with CNV list

Reading segment list (CNVs) from [ inherited_clean.cnv ]
Writing back list to intersected regions to [ inherited_clean_lenBT500_genes.reg ]
5 mapped to a person, of which 1 passed filters
1 intersected with one or more specified region
1 of 5 mapped as valid segments
 CopyN Case/Control
     1        0 / 1

Writing per-individual summary to [ inherited_clean_lenBT500_genes.cnv.indiv ]
Writing positional summary to [ inherited_clean_lenBT500_genes.cnv.s

Reading pedigree information from [ ../simplex_nofamid.ped ] 
99 individuals read from [ ../simplex_nofamid.ped ] 
99 individuals with nonmissing phenotypes
Assuming a disease phenotype (0=unaff, 1=aff, other=miss)
21 cases, 78 controls and 0 missing
56 males, 43 females, and 0 of unspecified sex
Reading intersection subset list from [ hg19_allenBrainGene_trimmed.txt ]
Looking for subset of 606 ranges
Reading CNV intersection list from [ glist-hg19 ]
Read 577 ranges to intersect with CNV list
Could not find 29 ranges
Writing this list to [ inherited_clean_lenBT500_delBrainGenes.notfound ] 

Reading segment list (CNVs) from [ inherited_clean.cnv ]
Writing back list to intersected regions to [ inherited_clean_lenBT500_delBrainGenes.reg ]
5 mapped to a person, of which 1 passed filters
0 intersected with one or more specified region
0 of 5 mapped as valid segments
 CopyN Case/Control

Writing per-individual summary to [ inherited_clean_lenBT500_delBrainGenes.cnv.indiv ]
Writing positional

Now let's do some plotting in Python.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
%matplotlib inline

In [2]:
# figure out who is who in each trio
import glob
ped_file = '/data/sudregp/cnv/simplex.ped'
wes_prefix = ['CLIA', 'CCGO', 'WPS']
trios = []
affected = []
controls = []
samples = []
famids = []
fid = open(ped_file, 'r')
for line in fid:
    famid, sid, fa, mo, sex, aff = line.rstrip().split('\t')
    if fa[:4] in wes_prefix and mo[:4] in wes_prefix and sid[:4] in wes_prefix:
        fam = {}
        fam['child'] = sid
        if aff == '1':
            affected.append(sid)
        else:
            controls.append(sid)
        fam['father'] = fa
        fam['mother'] = mo
        fam['famid'] = famid
        trios.append(fam)
        samples += [sid, fa, mo]
        famids.append(famid)
fid.close()
samples = set(samples)
famids = set(famids)

fid = open('/data/sudregp/cnv/kid_samples.txt', 'r')
good_kids = [line.rstrip() for line in fid]
fid.close()

In [3]:
def plot_plink_cnvs(fname, t_str, verbose=False):
    
    import plotly.graph_objs as go
    from plotly import tools

    df = pd.read_table(fname, delimiter='\s+', index_col=1)
    df.head()

    x_red, x_green, red, green, xticks = [], [], [], [], []
    red_text, green_text = [], []

    # loop through families
    f = 0
    for fam in famids:
        fam_kids = [t['child'] for t in trios if t['famid'] == fam]
        found = False
        for kid in fam_kids:
            if kid in good_kids:
                found = True
                idx = [i for i in range(df.shape[0]) if df.index[i].find(kid) >= 0][0]
                if kid in affected:
                    red.append(int(df.iloc[idx]['NSEG']))
                    x_red.append(f)
                    red_text.append(kid)
                else:
                    green.append(int(df.iloc[idx]['NSEG']))
                    x_green.append(f)
                    green_text.append(kid)
        # only increase counter if we added a kid
        if found:
            xticks.append(fam)
            f += 1

    fig = go.Figure()
                
    trace0 = go.Scatter(
        x = x_red,
        y = red,
        mode = 'markers',
        name = 'affected',
        marker = dict(size = 10, color = 'red'),
        text = red_text,
        hoverinfo='text+y',
        showlegend = True
    )
    trace1 = go.Scatter(
        x = x_green,
        y = green,
        mode = 'markers',
        name = 'unaffected',
        marker = dict(size = 10, color = 'green'),
        hovertext = green_text,
        hoverinfo='text+y',
        showlegend = True
    )
    fig['data'] = [trace0, trace1]
    fig['layout'].update(height=400, width=800, title=t_str,
                             xaxis1=dict(tickvals=range(len(xticks)),
                                        ticktext=xticks,
                                        zeroline = False),
                             hovermode='closest')
    iplot(fig)
    
    # print family order
    if verbose:
        score = []
        for x in range(max(x_green + x_red)):
            idx = [i for i, val in enumerate(x_red) if val == x]
            if len(idx) == 0:
                aff = 0
            else:
                aff = red[idx[0]]
            idx = [i for i, val in enumerate(x_green) if val == x]
            if len(idx) == 0:
                unaff = 0
            else:
                unaff = max([green[i] for i in idx])
            score.append(aff - unaff)
        order = np.argsort(score)[::-1]
        fam_names = fig.layout.xaxis1['ticktext']
        disrupted = [fam_names[v] for v in order if score[v] > 0]
        print 'Best disrupted:', ', '.join(disrupted)
        protected = [fam_names[v] for v in order[::-1] if score[v] < 0]
        print 'Best protected:', ', '.join(protected)

In [4]:
import glob

data_dir = '/home/sudregp/data/cnv/xhmm/'
files = glob.glob(data_dir + '/denovo*cnv.indiv')
for f in files:
    t_str = '.'.join(f.split('/')[-1].split('.')[:-2])
    plot_plink_cnvs(f, t_str, verbose=True)

Best disrupted: 10406, 10042, 10090, 10033, 10173, 1893, 1895, 10178, 10197, 10164, 10182, 10128
Best protected: 10094, 10131, 10041, 10448, 855, 10215, 10369, 10153


Best disrupted: 10406, 10042, 10033, 10153, 10369, 1893, 10448, 10173, 10197
Best protected: 10094, 855, 10128, 10041, 10090, 10131, 10182, 10215, 1895


Best disrupted: 10090, 10042, 10173, 1895, 10128, 855, 10033, 10178, 10182, 1893, 10197, 10164
Best protected: 10131, 10448, 10369, 10153, 10041, 10215, 10406, 10094


Best disrupted: 10406, 10042, 10090, 10033, 10173, 10197, 1893, 10128, 1895, 10178, 10182, 10164, 10369
Best protected: 10094, 10131, 10448, 10041, 855, 10215, 1892, 10153


Best disrupted: 10406, 10197
Best protected: 10041, 10215


Best disrupted: 10406, 10042, 10153, 10369, 10033, 10448, 10173, 10197, 1893, 10131
Best protected: 10094, 855, 10041, 10090, 10182, 10215, 1895, 1892, 10128


Best disrupted: 10406
Best protected: 10215


Best disrupted: 10090, 10042, 10173, 855, 10033, 10128, 1895, 10182, 10197, 10178, 1893, 10164
Best protected: 10131, 10448, 10153, 10369, 10041, 10215, 10094


Best disrupted: 10197
Best protected: 10041


Best disrupted: 10406, 10042, 10090, 10033, 10173, 1893, 1895, 10178, 10197, 10164, 10182, 10128
Best protected: 10094, 10131, 10041, 10448, 855, 10215, 10369, 10153


Best disrupted: 10406, 10042, 10090, 10033, 10173, 10197, 1893, 10128, 1895, 10178, 10182, 10164, 10369
Best protected: 10094, 10131, 10448, 10041, 855, 10215, 1892, 10153


Best disrupted: 10406, 10197
Best protected: 10041, 10215


Best disrupted: 10406, 10042, 10033, 10153, 10369, 1893, 10448, 10173, 10197
Best protected: 10094, 855, 10128, 10041, 10090, 10131, 10182, 10215, 1895


Best disrupted: 10406, 10042, 10153, 10369, 10033, 10448, 10173, 10197, 1893, 10131
Best protected: 10094, 855, 10041, 10090, 10182, 10215, 1895, 1892, 10128


Best disrupted: 10406
Best protected: 10215


Best disrupted: 10090, 10042, 10173, 1895, 10128, 855, 10033, 10178, 10182, 1893, 10197, 10164
Best protected: 10131, 10448, 10369, 10153, 10041, 10215, 10406, 10094


Best disrupted: 10090, 10042, 10173, 855, 10033, 10128, 1895, 10182, 10197, 10178, 1893, 10164
Best protected: 10131, 10448, 10153, 10369, 10041, 10215, 10094


Best disrupted: 10197
Best protected: 10041


Best disrupted: 10406
Best protected: 10369


Best disrupted: 10406
Best protected: 10369


Best disrupted: 
Best protected: 


Best disrupted: 10406
Best protected: 


Best disrupted: 10406
Best protected: 


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 10369


Best disrupted: 
Best protected: 10369


Best disrupted: 
Best protected: 


Best disrupted: 10406, 10090, 10164, 10173
Best protected: 10131, 10369, 1895


Best disrupted: 10406, 10164, 10090, 10173
Best protected: 10131, 1895


Best disrupted: 
Best protected: 


Best disrupted: 10406, 10369, 10448
Best protected: 1895


Best disrupted: 10406, 10369, 10448
Best protected: 1895


Best disrupted: 
Best protected: 


Best disrupted: 10090, 10164, 10173
Best protected: 10369, 10448, 10131


Best disrupted: 10090, 10164, 10173
Best protected: 10369, 10448, 10131


Best disrupted: 
Best protected: 


Best disrupted: 10406
Best protected: 10369


Best disrupted: 10406
Best protected: 10369


Best disrupted: 
Best protected: 


Best disrupted: 10406
Best protected: 


Best disrupted: 10406
Best protected: 


Best disrupted: 
Best protected: 


Best disrupted: 
Best protected: 10369


Best disrupted: 
Best protected: 10369


Best disrupted: 
Best protected: 


As usual, cleaning does help a lot. We still have some weird stuff going on for family 10406, which is catching a lot of deletions. But there might be come cool stuff for 10090 and 10164. Let's take a look at the genes:

In [1]:
%%bash
cd /home/sudregp/data/cnv/xhmm/

for kid in CLIA_400122 CLIA_400123 CLIA_400178 CLIA_400149; do
    echo $kid
    grep -B 3 $kid denovo_clean_lenBT0_genes.reg | grep RANGE;
done

CLIA_400122
RANGE (+/- 0kb )  [ 2 99235568 99347589 MGAT4A ]
RANGE (+/- 0kb )  [ 2 174771186 174830430 SP3 ]
RANGE (+/- 0kb )  [ 5 172483354 172566291 CREBRF ]
CLIA_400123
RANGE (+/- 0kb )  [ 4 17812435 17846487 NCAPG ]
RANGE (+/- 0kb )  [ 4 17844838 18023483 LCORL ]
RANGE (+/- 0kb )  [ 12 22778075 22843608 ETNK1 ]
RANGE (+/- 0kb )  [ 13 108870762 108886603 ABHD13 ]
CLIA_400178
RANGE (+/- 0kb )  [ 1 62901974 62917475 USP1 ]
RANGE (+/- 0kb )  [ 1 62920396 63154039 DOCK7 ]
RANGE (+/- 0kb )  [ 1 67390577 67454302 MIER1 ]
RANGE (+/- 0kb )  [ 1 71512188 71532865 ZRANB2-AS1 ]
RANGE (+/- 0kb )  [ 1 71528973 71546972 ZRANB2 ]
RANGE (+/- 0kb )  [ 1 71533313 71533399 MIR186 ]
RANGE (+/- 0kb )  [ 1 71547006 71703406 ZRANB2-AS2 ]
RANGE (+/- 0kb )  [ 1 71868624 72748277 NEGR1 ]
RANGE (+/- 0kb )  [ 1 93811477 93828148 DR1 ]
RANGE (+/- 0kb )  [ 2 48010220 48034092 MSH6 ]
RANGE (+/- 0kb )  [ 2 48034058 48132932 FBXO11 ]
RANGE (+/- 0kb )  [ 2 174771186 174830430 SP3 ]
RANGE (+/- 0kb )  [ 2 201390864 20

In [2]:
%%bash
cd /home/sudregp/data/cnv/xhmm/

for kid in CCGO_800980 CLIA_400205 CLIA_400178; do
    echo $kid
    grep -B 3 $kid denovo_clean_lenBT0_delGenes.reg | grep RANGE;
done

CCGO_800980
RANGE (+/- 0kb )  [ X 80369199 80457441 HMGN5 ]
RANGE (+/- 0kb )  [ X 101615315 101694929 NXF2 ]
CLIA_400205
RANGE (+/- 0kb )  [ 8 30689059 30706533 TEX15 ]
CLIA_400178
RANGE (+/- 0kb )  [ 1 62901974 62917475 USP1 ]
RANGE (+/- 0kb )  [ 1 62920396 63154039 DOCK7 ]
RANGE (+/- 0kb )  [ 1 67390577 67454302 MIER1 ]
RANGE (+/- 0kb )  [ 1 71512188 71532865 ZRANB2-AS1 ]
RANGE (+/- 0kb )  [ 1 71528973 71546972 ZRANB2 ]
RANGE (+/- 0kb )  [ 1 71533313 71533399 MIR186 ]
RANGE (+/- 0kb )  [ 1 71547006 71703406 ZRANB2-AS2 ]
RANGE (+/- 0kb )  [ 1 71868624 72748277 NEGR1 ]
RANGE (+/- 0kb )  [ 1 93811477 93828148 DR1 ]
RANGE (+/- 0kb )  [ 2 48010220 48034092 MSH6 ]
RANGE (+/- 0kb )  [ 2 48034058 48132932 FBXO11 ]
RANGE (+/- 0kb )  [ 2 174771186 174830430 SP3 ]
RANGE (+/- 0kb )  [ 2 201390864 201448818 SGOL2 ]
RANGE (+/- 0kb )  [ 3 42642146 42690233 NKTR ]
RANGE (+/- 0kb )  [ 3 180319917 180336135 TTC14 ]
RANGE (+/- 0kb )  [ 3 180331795 180397283 CCDC39 ]
RANGE (+/- 0kb )  [ 4 17812435 17846

# Gene-based analysis

Let's see if there are any specific genes we should look at, before we do any cleaning:

In [7]:
fid = open('/home/sudregp/data/cnv/xhmm/denovo_lenBT0_genes.reg')
genes = {}
for line in fid:
    if line.find('RANGE') >= 0:
        gene = line.split(' ')[-2]
        genes[gene] = 0
    elif line.find('DUP') > 0 or line.find('DEL') > 0:
        genes[gene] += 1
fid.close()
df = pd.DataFrame.from_dict(genes, orient='index')
df.columns = ['count']
df.sort_values(by='count', ascending=False).head(10)

Unnamed: 0,count
GOLGA6L10,7
NBPF20,7
NBPF9,7
GOLGA6L9,6
MIR548N,6
GOLGA6L17P,6
TTN,5
GSTTP1,5
GSTTP2,5
RNU6-76P,5


So, these genes are hit by CNVs in 5 to 7 kids, out of the 49 trios we have. And these are all denovo results, which means that it'd be nice if any of those all belong to affected kids. Or maybe even a combination would work? How about some sort of deletion is bad, duplication is good scheme?

In [8]:
dfs = df.sort_values(by='count', ascending=False).head(10)
for index, count in dfs.iterrows():
    nlines = count[0]
    gene = index
    fid = open('/home/sudregp/data/cnv/xhmm/denovo_lenBT0_genes.reg')
    for line in fid:
        if line.find(gene) >= 0:
            print line.rstrip()
            for i in range(nlines + 1):
                line = fid.next()
                print line.rstrip()
    fid.close()

RANGE (+/- 0kb )  [ 15 82632349 83018198 GOLGA6L10 ]
 FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     OLAP   OLAP_U   OLAP_R
   1   CLIA_400144        1   15     82611934     82664578    DEL    52.64   0.6122  0.07933  0.08353
   1   CLIA_400181        1   15     82917384     83015629    DEL    98.25        1   0.2546   0.2546
   1   CLIA_400210        1   15     82917384     83041282    DEL    123.9   0.8137   0.2465   0.2613
   1   CLIA_400129        2   15     82917384     83041282    DEL    123.9   0.8137   0.2465   0.2613
   1   CLIA_400175        1   15     83011424     83041282    DUP    29.86   0.2269  0.01657  0.01756
   1   CLIA_400144        1   15     83011424     83041282    DUP    29.86   0.2269  0.01657  0.01756
   1   CLIA_400130        2   15     83011424     83041282    DUP    29.86   0.2269  0.01657  0.01756
RANGE (+/- 0kb )  [ 1 144146810 146467744 NBPF20 ]
 FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     O

In PLINK notation, 1 is unaffected, 2 is affected. So, nothing really there. What if we look at the clean set only?

In [9]:
fid = open('/home/sudregp/data/cnv/xhmm/denovo_clean_lenBT0_genes.reg')
genes = {}
for line in fid:
    if line.find('RANGE') >= 0:
        gene = line.split(' ')[-2]
        genes[gene] = 0
    elif line.find('DUP') > 0 or line.find('DEL') > 0:
        genes[gene] += 1
fid.close()
df = pd.DataFrame.from_dict(genes, orient='index')
df.columns = ['count']
df.sort_values(by='count', ascending=False).head(10)

Unnamed: 0,count
ABHD13,4
LCORL,2
SP3,2
CREBRF,2
CHD9,2
NXF2,2
ETNK1,2
NCAPG,2
DR1,1
BCLAF1,1


In [10]:
dfs = df.sort_values(by='count', ascending=False).head(7)
for index, count in dfs.iterrows():
    nlines = count[0]
    gene = index
    fid = open('/home/sudregp/data/cnv/xhmm/denovo_clean_lenBT0_genes.reg')
    for line in fid:
        if line.find(gene) >= 0:
            print line.rstrip()
            for i in range(nlines + 1):
                line = fid.next()
                print line.rstrip()
    fid.close()

RANGE (+/- 0kb )  [ 13 108870762 108886603 ABHD13 ]
 FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     OLAP   OLAP_U   OLAP_R
   1   CLIA_400158        1   13    108882417    108886344    DEL    3.927        1   0.2479   0.2479
   1   CLIA_400123        2   13    108882665    108885500    DUP    2.835        1    0.179    0.179
   1   CLIA_400178        2   13    108882844    108886540    DEL    3.696        1   0.2334   0.2334
   1   CCGO_800979        1   13    108884227    108884732    DUP    0.505        1  0.03194  0.03194
RANGE (+/- 0kb )  [ 4 17844838 18023483 LCORL ]
 FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     OLAP   OLAP_U   OLAP_R
   1   CLIA_400123        2    4     17845860     17879030    DUP    33.17        1   0.1857   0.1857
   1   CLIA_400178        2    4     17845860     17879761    DEL     33.9        1   0.1898   0.1898
RANGE (+/- 0kb )  [ 2 174771186 174830430 SP3 ]
 FID           IID      PHE  CHR    

Now this becomes a bit more interesting. Especially if we talk about combinations of genes. Say, you're screwed if you have two (or more) disrupted of the genes in this list. In that case, 400122 has 2, 400123 has 3, 400178 has 6. Does it vary with symptoms?

What if we do a similar analysis in all genes (not only denovo)?

In [11]:
fid = open('/home/sudregp/data/cnv/xhmm/all_lenBT0_genes.reg')
genes = {}
for line in fid:
    if line.find('RANGE') >= 0:
        gene = line.split(' ')[-2]
        genes[gene] = 0
    elif line.find('DUP') > 0 or line.find('DEL') > 0:
        genes[gene] += 1
fid.close()
df = pd.DataFrame.from_dict(genes, orient='index')
df.columns = ['count']
df.sort_values(by='count', ascending=False).head(10)

Unnamed: 0,count
NBPF20,81
NBPF9,63
FCGBP,56
MIR548N,49
AHNAK2,45
GOLGA6L22,45
PCDHA2,44
PCDHA3,44
PCDHA1,44
PCDHA7,44


Now our counts go up to 99, because we're looking at all individuals. Still, some of those counts look quite high. Let's use the clean set instead:

In [12]:
fid = open('/home/sudregp/data/cnv/xhmm/all_clean_lenBT0_genes.reg')
genes = {}
for line in fid:
    if line.find('RANGE') >= 0:
        gene = line.split(' ')[-2]
        genes[gene] = 0
    elif line.find('DUP') > 0 or line.find('DEL') > 0:
        genes[gene] += 1
fid.close()
df = pd.DataFrame.from_dict(genes, orient='index')
df.columns = ['count']
df.sort_values(by='count', ascending=False).head(10)

Unnamed: 0,count
NXF2,9
ABHD13,8
SP3,6
MIER1,5
THAP5,5
MAGEA6,5
CCDC168,5
PNPLA8,5
LCORL,4
LYSMD3,4


This is better, but it's still out of 99. Anything comes out as interesting? (let's print only the kids)

In [13]:
dfs = df.sort_values(by='count', ascending=False).head(7)
for index, count in dfs.iterrows():
    nlines = count[0]
    gene = index
    fid = open('/home/sudregp/data/cnv/xhmm/all_clean_lenBT0_genes.reg')
    for line in fid:
        if line.find(gene) >= 0:
            print line.rstrip()
            for i in range(nlines + 1):
                line = fid.next()
                if line.find('FID') < 0:
                    sample = line.split('   ')[2]
                    if sample in affected + controls:
                        print line.rstrip()
                else:
                    print line.rstrip()
    fid.close()

RANGE (+/- 0kb )  [ X 101615315 101694929 NXF2 ]
 FID           IID      PHE  CHR          BP1          BP2   TYPE       KB     OLAP   OLAP_U   OLAP_R
   1   CLIA_400138        1   23    101615646    101620272    DUP    4.626        1  0.05812  0.05812
   1   CLIA_400128        2   23    101615646    101620272    DEL    4.626        1  0.05812  0.05812
   1   CLIA_400122        2   23    101615646    101620272    DUP    4.626        1  0.05812  0.05812
   1   CLIA_400149        2   23    101615646    101620272    DUP    4.626        1  0.05812  0.05812
   1   CLIA_400129        2   23    101615646    101620272    DEL    4.626        1  0.05812  0.05812
   1   CCGO_800980        2   23    101615646    101620272    DEL    4.626        1  0.05812  0.05812
   1   CCGO_800979        1   23    101615646    101620272    DUP    4.626        1  0.05812  0.05812
   1   CLIA_400142        1   23    101619868    101620272    DUP    0.404        1 0.005087 0.005087
   1   CLIA_400131        2   23 

CCDC could be something. Or we could look into the combinations deal as well.

# TODO

* Apply CNV length cleaning
* Play with quality knob.
* Do gene-based analysis parsing the .reg file.
* Follow QC rules from XHMM papers 
* Look into sex chromosomes? Something to the idea that adhd is more present in boys...
* Play with the HMM parameters
* do population frequency filtering first?
* Include parent burden in the analysis
* Match with file of ranked simplex by Wendy (maybe blindly)?
* Try other WES CNV callers?