In [1]:
# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Enable the %R / %%R magics provided by rpy2.
%load_ext rpy2.ipython

# Run a site-local initialisation script. Its contents are not visible in this
# notebook, so whatever names it defines are unknown here.
%run /gfs/devel/ddanko/notebook-init.py

# Load R libraries; invisible() hides the value that library() returns.
%R invisible(library(ggplot2))
%R invisible(library(fastcluster))
%R invisible(library(reshape))
%R invisible(library(reshape2))
%R invisible(library(gplots))
%R invisible(library(RSQLite))

# Set up ggplot2 defaults: gray theme with a base font size of 18.
%R theme_set(theme_gray(base_size=18))

# Populate the interactive namespace from numpy and matplotlib and draw
# figures inline.
%pylab inline

# Default matplotlib figure size as (width, height).
pylab.rcParams['figure.figsize'] = (10, 6)


# Root directory of the analysis outputs. NOTE(review): not referenced by any
# of the cells visible in this notebook.
froot = '/gfs/work/ddanko/taxonomic_profiler_comparison/analysis/actual/'

Populating the interactive namespace from numpy and matplotlib


In [12]:
import subprocess
import gzip

def pAlignedDmnd(seqfile,alignfile):
    """Return the fraction of sequences in `seqfile` that appear in `alignfile`.

    The denominator comes from countSeqs(seqfile) and the numerator from
    countAligned(alignfile); the result is a float.

    Raises:
        ZeroDivisionError: if countSeqs reports zero sequences.
    """
    total = countSeqs(seqfile)
    hits = countAligned(alignfile)
    return hits / float(total)
    
def countSeqs(seqfile):
    """Count the records in a gzip-compressed FASTQ file.

    The file is decompressed with the external `zcat` program and its lines
    are counted with `wc -l`; the line count is divided by four, i.e. the
    file is assumed to use four lines per record.

    Args:
        seqfile: path to the gzip-compressed file.

    Returns:
        The number of records as an int (line count floor-divided by 4).

    Raises:
        OSError: if `zcat` or `wc` cannot be executed.
        subprocess.CalledProcessError: if `wc` exits with a non-zero status.
    """
    # Build the `zcat | wc -l` pipeline from argument lists rather than a
    # shell string so that paths containing spaces or shell metacharacters
    # are passed through verbatim.
    zcat = subprocess.Popen(['zcat', seqfile], stdout=subprocess.PIPE)
    try:
        out = subprocess.check_output(['wc', '-l'], stdin=zcat.stdout)
    finally:
        zcat.stdout.close()
        # NOTE(review): as with the original shell pipeline, zcat's own exit
        # status is not inspected, so an unreadable file yields a count of 0.
        zcat.wait()
    # Floor division keeps the result an int on both Python 2 and Python 3.
    return int(out) // 4

def countAligned(alignfile):
    """Count the distinct query ids in a gzip-compressed alignment table.

    Each non-blank line is split on whitespace and its first field is taken
    as the query id (presumably the read name; the file format is not
    checked here). A query that appears on several lines is counted once.

    Args:
        alignfile: path to the gzip-compressed, whitespace-delimited file.

    Returns:
        The number of distinct first-column values as an int.
    """
    aligned = set()
    with gzip.open(alignfile) as af:
        for line in af:
            fields = line.split()
            # Blank lines have no fields; skip them instead of raising.
            if fields:
                aligned.add(fields[0])
    return len(aligned)


In [18]:
# Example inputs for a single sample: the gzip-compressed FASTQ file and the
# matching alignment table from the acdmnd.dir sub-directory.
seqfile = '/gfs/work/ddanko/taxonomic_profiler_comparison/analysis/actual/g_pig/gpigfecal-dna-R495409.fastq.gz'
alignfile = '/gfs/work/ddanko/taxonomic_profiler_comparison/analysis/actual/g_pig/acdmnd.dir/gpigfecal-dna-R495409.acdmnd.tsv.gz'

In [16]:
# Total number of records in the example FASTQ file; the bare name on the
# last line makes the notebook display the value.
nseqs = countSeqs(seqfile)
nseqs

19165522

In [19]:
# Number of distinct query ids found in the example alignment table.
naligned = countAligned(alignfile)
naligned

3659247

In [21]:
# Fraction of all records that have at least one line in the alignment table
# (float() forces true division).
float(naligned)/nseqs

0.1909286373728824

In [23]:
import gzip

def countKrakenClassified(kfile):
    """Count the lines of a gzip-compressed file whose first non-blank
    character is 'C'.

    Judging by the function name, 'C' marks a classified read in the
    per-read Kraken output; the format itself is not validated here.

    Args:
        kfile: path to the gzip-compressed file.

    Returns:
        The number of matching lines as an int.
    """
    nclass = 0
    with gzip.open(kfile) as kf:
        for line in kf:
            # gzip.open yields bytes on Python 3, so compare against a bytes
            # prefix. startswith also copes with blank lines, which made the
            # previous `line.strip()[0]` raise IndexError.
            if line.lstrip().startswith(b'C'):
                nclass += 1
    return nclass

In [24]:
# Classification output for the same example sample, from the kraken.dir
# sub-directory.
kfile = '/gfs/work/ddanko/taxonomic_profiler_comparison/analysis/actual/g_pig/kraken.dir/gpigfecal-dna-R495409.classified.tsv.gz'

# Number of lines counted as classified; the bare name displays the value.
nkraken = countKrakenClassified(kfile)
nkraken

1144232

In [25]:
# Fraction of all records counted as classified (float() forces true
# division). Relies on `nseqs` computed in an earlier cell.
float(nkraken)/nseqs

0.059702626414245334

In [33]:
import os
import glob

def findClassFiles(seqfile):
    """Locate the per-tool result files that belong to a FASTQ file.

    For `<dir>/<base>.fastq.gz` the following paths are probed and the ones
    that exist as regular files are returned:

        acdmnd      <dir>/acdmnd.dir/<base>.acdmnd.tsv.gz
        acdmndfast  <dir>/acdmndfast.dir/<base>.acdmndfast.tsv.gz
        kraken      <dir>/kraken.dir/<base>.classified.tsv.gz

    Args:
        seqfile: path to the FASTQ file; its basename must contain
            '.fastq.gz'.

    Returns:
        A dict mapping tool name to the path of its existing result file.
        Tools whose file is missing are omitted.

    Raises:
        ValueError: if the basename does not contain '.fastq.gz'.
    """
    root = os.path.dirname(seqfile)
    base = os.path.basename(seqfile)
    base = base[:base.index('.fastq.gz')]

    # (tool name, sub-directory, file-name suffix) for every supported tool.
    layout = (
        ('acdmnd', 'acdmnd.dir', '.acdmnd.tsv.gz'),
        ('acdmndfast', 'acdmndfast.dir', '.acdmndfast.tsv.gz'),
        ('kraken', 'kraken.dir', '.classified.tsv.gz'),
    )

    files = {}
    for tool, subdir, suffix in layout:
        # os.path.join keeps the path relative when `root` is empty (a bare
        # file name); plain '{}/...' formatting would point at '/' instead.
        candidate = os.path.join(root, subdir, base + suffix)
        if os.path.isfile(candidate):
            files[tool] = candidate

    return files

def processClassFiles(seqfile,classfiles):
    """Collect the read count and the per-tool hit counts for one sample.

    Args:
        seqfile: path handed to countSeqs for the total number of sequences.
        classfiles: dict mapping tool name to result file path.

    Returns:
        A dict with key 'nseqs' plus one key per recognised tool
        ('acdmnd', 'acdmndfast', 'kraken'). Entries of `classfiles` with any
        other tool name are ignored.
    """
    counts = {'nseqs': countSeqs(seqfile)}
    for tool in classfiles:
        path = classfiles[tool]
        if tool == 'kraken':
            counts[tool] = countKrakenClassified(path)
        elif tool == 'acdmnd' or tool == 'acdmndfast':
            counts[tool] = countAligned(path)
    return counts

def findSet(fastqglob):
    """Gather counts for every FASTQ file matched by a glob pattern.

    Args:
        fastqglob: glob pattern selecting the '.fastq.gz' files.

    Returns:
        A dict keyed by each file's basename up to '.fastq.gz', whose values
        are the count dicts produced by processClassFiles. Files that share
        a basename overwrite one another.

    Raises:
        ValueError: if a matched basename does not contain '.fastq.gz'.
    """
    allfiles = {}
    for fastq in glob.glob(fastqglob):
        name = os.path.basename(fastq)
        sample = name[:name.index('.fastq.gz')]
        allfiles[sample] = processClassFiles(fastq, findClassFiles(fastq))
    return allfiles






In [None]:
# Process every FASTQ file one directory level below the analysis root and
# display the resulting {sample: counts} dict.
fastqglob = '/gfs/work/ddanko/taxonomic_profiler_comparison/analysis/actual/*/*.fastq.gz'
allfiles = findSet(fastqglob)
allfiles

In [None]:

import numpy as np
import pandas as pd

# One row per sample, one column per count ('nseqs' plus each tool found).
df = pd.DataFrame(allfiles).transpose()
df


In [None]:
# Express every count as a percentage of that sample's nseqs (row-wise
# division), so the nseqs column itself becomes 100.
100 * df.div(df.nseqs,axis='index')