It's very weird to me that the results in xhmm_clean and xhmm_clean2 are so different, given that we only removed 2 subjects between them. Let's see what could be causing it:

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# set up plotly for use inside the notebook
# NOTE(review): see the plotly.offline docs for what `connected=True` implies
init_notebook_mode(connected=True)
# show matplotlib figures inline, below the cell that creates them
%matplotlib inline

In [7]:
import glob
ped_files = ['/data/sudregp/multiplex_simplex/simplex.ped']
wes_prefix = ['CLIA', 'CCGO', 'WPS']

# no controls/affected pair for comparison
exclude_list = ['CLIA_400165', 'CLIA_400164', 'CLIA_400155', 'CLIA_400146',
                'CLIA_400145', 'CLIA_400126', 'CLIA_400207', 'CLIA_400208',
                'CLIA_400209']
# missing one parent
exclude_list += ['CLIA_400169', 'CLIA_400168']
# family 9030
exclude_list += ['CCGO_800978', 'CCGO_800977', 'CCGO_800976', 'CCGO_800979',
                 'CCGO_800980', 'CLIA_400067']

# Build the list of complete trios (child + both parents with WES data).
#   trios    : one dict per kept trio (child, father, mother, famid)
#   affected : child IDs whose last pedigree column is '1'
#   controls : all other kept child IDs
#   samples  : every sample ID (children and parents) in a kept trio
#   famids   : family IDs with at least one kept trio
trios = []
affected = []
controls = []
samples = set()
famids = set()
for ped_file in ped_files:
    # `with` closes the pedigree file even if a row fails to parse
    with open(ped_file, 'r') as fid:
        for line in fid:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            famid, sid, fa, mo, sex, aff = line.rstrip().split('\t')
            members = [sid, fa, mo]
            # the ID prefix (text before the first '_') tells whether the
            # sample has WES data
            has_wes = all(m.split('_')[0] in wes_prefix for m in members)
            # A trio is unusable as soon as ANY of its members is excluded.
            # The previous test chained the three `not in` checks with `or`,
            # which only dropped a trio when all three members were listed.
            is_excluded = any(m in exclude_list for m in members)
            # `sid not in samples` skips IDs already pulled in, either as a
            # child or as somebody's parent
            if has_wes and sid not in samples and not is_excluded:
                trios.append({'child': sid, 'father': fa, 'mother': mo,
                              'famid': famid})
                # NOTE(review): '1' is treated as affected here; the standard
                # PLINK coding is 2 = affected -- confirm this file's coding
                if aff == '1':
                    affected.append(sid)
                else:
                    controls.append(sid)
                samples.update(members)
                famids.add(famid)
kids = set(affected + controls)

# single-argument print(...) gives the same text under Python 2 and 3
print('Unique samples: %d' % len(samples))
print('Unique families: %d' % len(famids))
print('Unique children: %d' % len(kids))

Unique samples: 84
Unique families: 19
Unique children: 46


In [8]:
# Keep independent copies of the 84-sample selection: the following cell
# rebinds `samples` for the 88-sample selection.
samples84 = set(samples)
kids84 = set(kids)

In [9]:
import glob
ped_file = '/data/sudregp/cnv/simplex.ped'
wes_prefix = ['CLIA', 'CCGO', 'WPS']

# cleaning up the data for this analysis: one sample ID per line in each file
exclude_list = []
for exclude_file in ['/home/sudregp/data/cnv/exclude.txt',
                     '/home/sudregp/data/cnv/exclude_multikids.txt']:
    # `with` closes the file even if reading fails part-way
    with open(exclude_file, 'r') as fid:
        exclude_list += [line.rstrip() for line in fid]

# Build the list of complete trios (child + both parents with WES data).
# Only the child is checked against exclude_list in this selection.
trios = []
affected = []
controls = []
samples = []
famids = []
with open(ped_file, 'r') as fid:
    for line in fid:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        famid, sid, fa, mo, sex, aff = line.rstrip().split('\t')
        # Compare the text before the first '_' with the known prefixes.
        # Slicing the first 4 characters (the previous test) can never match
        # the 3-letter prefix 'WPS', because 'WPS_' is not in wes_prefix.
        has_wes = all(m.split('_')[0] in wes_prefix for m in (sid, fa, mo))
        if has_wes and sid not in exclude_list:
            trios.append({'child': sid, 'father': fa, 'mother': mo,
                          'famid': famid})
            # NOTE(review): '1' is treated as affected here; the standard
            # PLINK coding is 2 = affected -- confirm this file's coding
            if aff == '1':
                affected.append(sid)
            else:
                controls.append(sid)
            samples += [sid, fa, mo]
            famids.append(famid)
samples = set(samples)
famids = set(famids)
# children of the kept trios, in pedigree order
good_kids = [t['child'] for t in trios]

# single-argument print(...) gives the same text under Python 2 and 3
print('Working with %d samples, in %d families' % (len(samples), len(famids)))

Working with 88 samples, in 20 families


In [10]:
samples88 = samples.copy()
# The 88-sample cell stores its children in `good_kids` and never rebinds
# `kids`, so `kids` still holds the children of the 84-sample selection.
# Copying it here made kids88 identical to kids84.
kids88 = set(good_kids)

# samples present in one selection but not in the other (sorted so the
# printed lists are stable between runs)
print(sorted(samples84 - samples88))
print(sorted(samples88 - samples84))

[]
['CCGO_800979', 'CCGO_800976', 'CCGO_800977', 'CCGO_800980']


OK, so everyone in 84 is in 88, and we're removing the 4 subjects we thought we were. Can we reproduce the difference in results?

In [19]:
# Write, for each run directory, the list of all samples (samples.txt) and the
# list of children (kid_samples.txt), one ID per line. These files are read by
# the terminal commands of the following cells.
for out_dir, run_samples, run_kids in [
        ('/data/sudregp/cnv/tmp88', samples88, kids88),
        ('/data/sudregp/cnv/tmp84', samples84, kids84)]:
    for fname, ids in [('samples.txt', run_samples),
                       ('kid_samples.txt', run_kids)]:
        # `with` closes (and flushes) the file even if a write fails
        with open(out_dir + '/' + fname, 'w') as fid:
            # sorted so the files are identical from one run to the next
            for s in sorted(ids):
                fid.write(s + '\n')

In [None]:
# Run in a terminal, once per run directory (tmp84 and tmp88).
# Expects samples.txt in the run directory; reads per-sample files and
# params.txt from ../xhmm_clean.

# inputs: exome capture targets (BED), memory passed to GATK via -m, reference
# FASTA
exome_targets='/data/NCR_SBRB/simplex/SeqCapEZ_Exome_v3.0_Design_Annotation_files/SeqCap_EZ_Exome_v3_hg19_capture_targets.bed'
gatk_memory="50g"
ref_fa='/fdb/GATK_resource_bundle/hg19-2.8/ucsc.hg19.fasta'
# NOTE: hard-coded to tmp84; change to .../tmp88 for the second run
out_dir='/data/sudregp/cnv/tmp84'

# everything below reads and writes relative to the run directory
cd $out_dir
module load GATK
module load XHMM

# per-target GC content; targets whose value in column 2 is below 0.1 or above
# 0.9 are listed for exclusion
GATK -m ${gatk_memory} GCContentByInterval -L ${exome_targets} -R ${ref_fa} -o ./DATA.locus_GC.txt
cat ./DATA.locus_GC.txt | awk '{if ($2 < 0.1 || $2 > 0.9) print $1}' > ./extreme_gc_targets.txt

# merging all subjects in the directory: copy every file in ../xhmm_clean whose
# name starts with a sample ID from samples.txt, then list the copied
# *.sample_interval_summary files as input for the merge
while read s; do
    cp ../xhmm_clean/${s}* .
done < samples.txt
ls -1 *.sample_interval_summary > depth_list.txt;
# reuse the parameter file of the original run (passed to xhmm with -p below)
cp ../xhmm_clean/params.txt .

# merge the listed per-sample depth files into a single read-depth matrix
xhmm --mergeGATKdepths --GATKdepthsList=depth_list.txt -o ./DATA.RD.txt;

# this does the same thing as the XHMM script, but it actually works in parsing
# the base pair start and ends
# (writes a header line, then CHR, BP1, BP2 and the BED row number as ID)
cat ${exome_targets} | awk 'BEGIN{OFS="\t"; print "#CHR\tBP1\tBP2\tID"}{print $1, $2, $3, NR}' > ./EXOME.targets.reg

# load the targets into a PLINK/SEQ locus database (group "targets")
module load plinkseq
pseq . loc-load --locdb ./EXOME.targets.LOCDB --file ./EXOME.targets.reg --group targets \
    --out ./EXOME.targets.LOCDB.loc-load --noweb

# this has the same effect as the suggested command, but it actually works
# (keeps columns 4 and 10 of the loc-stats output, skipping its first line, and
# rewrites the first ".." on each line as "-")
# NOTE(review): ./seqdb.hg19 must already exist in the run directory; nothing
# above creates or copies it -- confirm
pseq . loc-stats --locdb ./EXOME.targets.LOCDB --group targets --seqdb ./seqdb.hg19 --noweb | \
    awk '{if (NR > 1) { print  $4, $10 }}' | sed 's/\.\./-/' - > ./DATA.locus_complexity.txt

# targets whose value in column 2 exceeds 0.25 are listed for exclusion
cat ./DATA.locus_complexity.txt | awk '{if ($2 > 0.25) print $1}' > ./low_complexity_targets.txt

# filter targets and samples on size / read-depth thresholds, drop the
# extreme-GC and low-complexity targets, and center the data by target; the
# excluded targets and samples are written out for reuse further down
xhmm --matrix -r ./DATA.RD.txt --centerData --centerType target \
-o ./DATA.filtered_centered.RD.txt \
--outputExcludedTargets ./DATA.filtered_centered.RD.txt.filtered_targets.txt \
--outputExcludedSamples ./DATA.filtered_centered.RD.txt.filtered_samples.txt \
--excludeTargets ./extreme_gc_targets.txt --excludeTargets ./low_complexity_targets.txt \
--minTargetSize 10 --maxTargetSize 10000 \
--minMeanTargetRD 10 --maxMeanTargetRD 500 \
--minMeanSampleRD 25 --maxMeanSampleRD 200 \
--maxSdSampleRD 150

# PCA on the filtered, centered matrix
xhmm --PCA -r ./DATA.filtered_centered.RD.txt --PCAfiles ./DATA.RD_PCA

# normalize the matrix using the PCA results (method PVE_mean, factor 0.7)
xhmm --normalize -r ./DATA.filtered_centered.RD.txt --PCAfiles ./DATA.RD_PCA \
--normalizeOutput ./DATA.PCA_normalized.txt \
--PCnormalizeMethod PVE_mean --PVE_mean_factor 0.7

# center by sample and z-score the normalized data, filtering targets with
# --maxSdTargetRD 30; excluded targets and samples are written out again
xhmm --matrix -r ./DATA.PCA_normalized.txt --centerData --centerType sample --zScoreData \
-o ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt \
--outputExcludedTargets ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt.filtered_targets.txt \
--outputExcludedSamples ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt.filtered_samples.txt \
--maxSdTargetRD 30

# apply both sets of exclusions to the original (un-normalized) read-depth
# matrix so it covers the same targets and samples as the z-score matrix
xhmm --matrix -r ./DATA.RD.txt \
--excludeTargets ./DATA.filtered_centered.RD.txt.filtered_targets.txt \
--excludeTargets ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt.filtered_targets.txt \
--excludeSamples ./DATA.filtered_centered.RD.txt.filtered_samples.txt \
--excludeSamples ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt.filtered_samples.txt \
-o ./DATA.same_filtered.RD.txt

# call CNVs from the z-scores; calls are written to DATA.xcnv
xhmm --discover -p params.txt -r ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt \
    -R ./DATA.same_filtered.RD.txt -c ./DATA.xcnv -a ./DATA.aux_xcnv -s ./DATA

# genotype the calls in DATA.xcnv across samples; output is DATA.vcf
xhmm --genotype -p params.txt -r ./DATA.PCA_normalized.filtered.sample_zscores.RD.txt \
    -R ./DATA.same_filtered.RD.txt -g ./DATA.xcnv -F $ref_fa -v ./DATA.vcf



Now we create PSEQ and PLINK files for both datasets.

In [None]:
# in terminal, from inside the run directory (tmp84 or tmp88)
# Expects DATA.vcf, DATA.xcnv and kid_samples.txt in the current directory and
# the pedigree in ../simplex.ped.

pseq DATA new-project
# adding a first column with subject ID for PSEQ
cut -f 2 ../simplex.ped > junk.txt
paste junk.txt ../simplex.ped > simplex.ped.info
pseq DATA load-pedigree --file simplex.ped.info
pseq DATA load-vcf --vcf DATA.vcf

xcnv_to_cnv=/usr/local/apps/XHMM/2016-01-04/sources/scripts/xcnv_to_cnv

# Print the DATA.xcnv rows of every kid listed in kid_samples.txt.
#   $1 (optional): a file of pseq cnv-denovo calls. When given, only the rows
#                  whose interval is listed for that kid in column 3 of the
#                  call file are printed; otherwise all of the kid's rows are.
# grep -F -w matches whole, literal tokens, so an interval such as
# chr1:100-200 no longer also matches chr1:100-2000.
kid_xcnv_rows() {
    local calls=$1 sample cnv
    while read -r sample; do
        # an empty pattern would match every row
        [ -n "$sample" ] || continue
        if [ -z "$calls" ]; then
            grep -Fw -- "$sample" DATA.xcnv
            continue
        fi
        grep -Fw -- "$sample" DATA.xcnv > sample.xcnv
        # column 3 holds the interval as start..end; DATA.xcnv uses start-end
        grep -Fw -- "$sample" "$calls" | cut -f 3 | sed -e 's/\.\./-/' |
        while read -r cnv; do
            grep -Fw -- "$cnv" sample.xcnv
        done
    done < kid_samples.txt
    rm -f sample.xcnv
}

# Convert an .xcnv file ($1) into a PLINK .cnv file ($2):
# switch around FAMID and IID columns, and remove header
write_plink_cnv() {
    "$xcnv_to_cnv" "$1" > tmp.cnv
    awk '{OFS="\t"; if ( $3 != "CHR" ) {print $2, $1, $3, $4, $5, $6, $7, $8 }}' tmp.cnv > "$2"
    rm -f tmp.cnv
}

for q in 50 60 70 80 90; do
    pseq DATA cnv-denovo --noweb --minSQ $q --minNQ $q --out DATA_q${q}

    # filter out denovo CNVs
    grep DENOVO DATA_q${q}.denovo.cnv > pseq_DENOVO.txt
    # borrow the header row
    head -1 DATA.xcnv > denovo.xcnv
    kid_xcnv_rows pseq_DENOVO.txt >> denovo.xcnv
    write_plink_cnv denovo.xcnv denovo_q${q}.cnv
    rm -f pseq_DENOVO.txt denovo.xcnv

    # filter out inherited cnvs (maternal calls first, then paternal)
    grep MATERNAL_TRANSMITTED DATA_q${q}.denovo.cnv > pseq_TRANSMITTED.txt
    grep PATERNAL_TRANSMITTED DATA_q${q}.denovo.cnv >> pseq_TRANSMITTED.txt
    # borrow the header row
    head -1 DATA.xcnv > inherited.xcnv
    kid_xcnv_rows pseq_TRANSMITTED.txt >> inherited.xcnv
    write_plink_cnv inherited.xcnv inherited_q${q}.cnv
    rm -f pseq_TRANSMITTED.txt inherited.xcnv

    # compile all CNVs for kids: effectively just filtering DATA.xcnv to keep
    # only kids (this does not depend on $q, so every all_q*.cnv has the same
    # content)
    # borrow the header row
    head -1 DATA.xcnv > all.xcnv
    kid_xcnv_rows >> all.xcnv
    write_plink_cnv all.xcnv all_q${q}.cnv
    rm -f all.xcnv
done

In [2]:
%%bash
# Per run directory, count the q60 de novo CNV rows that contain "178".
# NOTE(review): the pattern matches anywhere on the line (IDs or coordinates)
# -- confirm that is intended.
for run in tmp88 tmp84 xhmm_clean2 xhmm_clean; do
    grep 178 ~/data/cnv/${run}/denovo_q60.cnv | wc -l
done

166
30
30
168


In [3]:
%%bash
# Per run directory, count the q70 de novo CNV rows that contain "178".
# NOTE(review): the pattern matches anywhere on the line (IDs or coordinates)
# -- confirm that is intended.
for run in tmp88 tmp84 xhmm_clean2 xhmm_clean; do
    grep 178 ~/data/cnv/${run}/denovo_q70.cnv | wc -l
done

122
17
17
123


In [4]:
%%bash
# Per run directory, count the q80 de novo CNV rows that contain "178".
# NOTE(review): the pattern matches anywhere on the line (IDs or coordinates)
# -- confirm that is intended.
for run in tmp88 tmp84 xhmm_clean2 xhmm_clean; do
    grep 178 ~/data/cnv/${run}/denovo_q80.cnv | wc -l
done

66
8
8
67


Yeah, it's quite concerning. Let's see if any of the other methods are as sensitive to the samples involved. 

In [11]:
# display every ID in the 84-sample set, to inspect which sample prefixes remain
samples84

{'CLIA_400121',
 'CLIA_400122',
 'CLIA_400123',
 'CLIA_400125',
 'CLIA_400127',
 'CLIA_400128',
 'CLIA_400129',
 'CLIA_400130',
 'CLIA_400131',
 'CLIA_400132',
 'CLIA_400133',
 'CLIA_400134',
 'CLIA_400135',
 'CLIA_400136',
 'CLIA_400137',
 'CLIA_400138',
 'CLIA_400139',
 'CLIA_400140',
 'CLIA_400141',
 'CLIA_400142',
 'CLIA_400143',
 'CLIA_400144',
 'CLIA_400147',
 'CLIA_400148',
 'CLIA_400149',
 'CLIA_400150',
 'CLIA_400151',
 'CLIA_400152',
 'CLIA_400153',
 'CLIA_400154',
 'CLIA_400156',
 'CLIA_400157',
 'CLIA_400158',
 'CLIA_400159',
 'CLIA_400160',
 'CLIA_400161',
 'CLIA_400162',
 'CLIA_400163',
 'CLIA_400166',
 'CLIA_400167',
 'CLIA_400170',
 'CLIA_400171',
 'CLIA_400172',
 'CLIA_400173',
 'CLIA_400174',
 'CLIA_400175',
 'CLIA_400176',
 'CLIA_400177',
 'CLIA_400178',
 'CLIA_400179',
 'CLIA_400180',
 'CLIA_400181',
 'CLIA_400182',
 'CLIA_400183',
 'CLIA_400184',
 'CLIA_400185',
 'CLIA_400186',
 'CLIA_400187',
 'CLIA_400188',
 'CLIA_400189',
 'CLIA_400190',
 'CLIA_400191',
 'CLIA_4

One interesting thing here is that the 84 analysis only contains CLIA samples, as we ended up removing all CCGO samples. It could be a coincidence, but maybe not. In any case, let's look at other methods.

It does look like the depth of coverage (DOC) of CCGO samples is generally higher than that of CLIA samples... but shouldn't that get corrected automatically by XHMM?