# _ipyrad_ testing tutorial

### Getting started
Import _ipyrad_ and remove previous test files if they are already present

In [1]:
## import modules
import ipyrad as ip      ## for RADseq assembly
print ip.__version__     ## print version

## clear data from test directory if it already exists
import shutil
import os
import subprocess
#if os.path.exists("./test_refseq/"):
#    shutil.rmtree("./test_refseq/")

0.0.65


In [2]:
## This is useful during development since IPython
## seems to want to re-use old .pyc files, 
## though even this doesn't always work...
import IPython.lib.deepreload
import __builtin__
from IPython.lib import deepreload
__builtin__.reload = deepreload.reload

### Initialize smalt (index reference sequence)
This is preparation for indexing. It only ever needs to be done once so shoud be tested during initialization.

`smalt index zf-ref ../zf/zf.sm.fa`

There is an optional -s flag that could improve mapping accuracy. Consider the best default, probably not worth letting people pass it in, if they want to mess with it they can index their own reference.

In [3]:
# hack the binary paths cuz the current egg doesn't have them in it
#data1.muscle=
#data1.vsearch
#data1.smalt

# Reference sequence directory (gzipped fasta files)
# TODO: set this as a parameter
# e.g., data1.set_params('refseq', "./data/zf.fa.gz")

# TODO: push this example file to the data/ dir 
REFSEQ = "./data/zf.fa.gz"

# Set the step size to 4 (default is 13)
# This will slow down read mapping, but increase accuracy
SMALT_INDEX_FLAGS = " -s 4 "

# TODO: create and link a dir/ to the Assembly object for the reference data files
data1.dirs.reference = '...'

# TODO: create and link index files to Sample objects
data1.samples['1A_0'].files.index_smi = '...'
data1.samples['1A_0'].files.index_sma = '...'

# Test if reference sequence is already indexed
# Only index if the .smi and .sma files don't exist, saves lots of time
if not os.path.isfile( REFSEQ+".smi" ):
    # smalt indexing will create two files called REFSEQ.smi and .sma
    # in the same directory as the reference sequence. 
    cmd = data1.smalt + " index " + SMALT_INDEX_FLAGS + REFSEQ + " " + REFSEQ
    print cmd
    subprocess.check_call(cmd, shell=True,
                            stderr=subprocess.STDOUT,
                            stdout=subprocess.PIPE)
    #output = subprocess.check_output( " ".join(cmd), shell=True)
else:
    print "Reference sequence index exists"

KeyError: '1A_0'

### Assembly and Sample objects

Assembly and Sample objects are used by _ipyrad_ to access data stored on disk and to manipulate it. Each biological sample in a data set is represented in a Sample object, and these Samples are stored inside Assembly objects. The Assembly object contains functions to assemble the data, and stores a log of all steps performed and the resulting statistics of those steps. Assembly objects can be copied or merged to allow branching events where different parameters are applied to assemblies. 

To create an Assembly object call ip.Assembly and pass it a name for the data set. We could imagine that we planned to assemble and later combine data from multiple sequencing runs, but before combining them each group of samples has to be analyzed under a different set of parameters. As an example, we could call two data sets "2014_data" and "2015_data". These initially do not contain any Samples. Sample objects are created either by linking fastq files to the Assembly object or by running step 1 to demultiplex raw data files. 

In [3]:
## create an Assembly object called data1. 
## It takes an 'test'
data1 = ip.Assembly("2014_data")
data2 = ip.Assembly("2015_data")

print "Assembly object named", data1.name
print "Assembly object named", data2.name


[]
0 new Samples created in 2014_data.
0 fastq files linked to Samples.
[]
0 new Samples created in 2015_data.
0 fastq files linked to Samples.
Assembly object named 2014_data
Assembly object named 2015_data


### Modifying assembly parameters
All of the parameter settings are linked to an Assembly object, which has a set of default parameters when it is created. These can be viewed using the `get_params()` function. To get more detailed information about all paramteres use `ip.get_params_info()` or to select a single parameter use `ip.get_params_info(3)`. Assembly objects have a function `set_params()` that can be used to modify parameters. 

In [4]:
## modify parameters for this Assembly object
data1.set_params(1, "./test_refseq")
data1.set_params(2, "./data/sim_rad_test_R1_.fastq.gz")
data1.set_params(3, "./data/sim_rad_test_barcodes.txt")
data1.set_params(4, "./test/test_refseq/2014_data_fastqs/")
data1.set_params(7, 3)
data1.set_params(10, 'rad')
data1.set_params(28, '/Volumes/WorkDrive/ipyrad/refhacking/MusChr1.fa')

## print the new parameters to screen
data1.get_params()

[]
0 new Samples created in 2014_data.
0 fastq files linked to Samples.
  1   working_directory             ./test_refseq                                
  2   raw_fastq_path                ./data/sim_rad_test_R1_.fastq.gz             
  3   barcodes_path                 ./data/sim_rad_test_barcodes.txt             
  4   sorted_fastq_path             ./test/test_refseq/2014_data_fastqs/         
  5   restriction_overhang          ('TGCAG', '')                                
  6   max_low_qual_bases            5                                            
  7   N_processors                  3                                            
  8   mindepth_statistical          6                                            
  9   mindepth_majrule              6                                            
  10  datatype                      rad                                          
  11  clust_threshold               0.85                                         
  12  minsamp             

### Starting data assembly and Sample objects
If the data are already demultiplexed then fastq files can be linked directly to the Data object, which in turn will create Sample objects for each fastq file (or pair of fastq files for paired data). The files may be gzip compressed. If the data are not demultiplexed then you will have to run the step1 function below to demultiplex the raw data.

In [68]:
## This would link fastq files from the 'sorted_fastq_path' if present
## Here it does nothing b/c there are no files in the sorted_fastq_path
data1.link_fastqs()

[]
0 new Samples created in 2014_data.
0 fastq files linked to Samples.


### Step 1: Demultiplex the raw data files
This uses the barcodes information to demultiplex reads in data files found in the 'raw_fastq_path'. It will create a Sample object for each sample that will be stored in the Assembly object. The state of each sample will be set to 1, meaning that the sample has completed step 1 of the _ipyrad_ assembly.

In [6]:
## run step 1 to demultiplex the data
data1.step1(preview=1)

## print the results for each Sample in data1
print data1.stats


samples already found in 2014_data use ip.merge() to combine samples 
from multipleAssembly objects
      state  reads_raw
1A_0      1      20099
1B_0      1      19977
1C_0      1      20114
1D_0      1      19895
2E_0      1      19928
2F_0      1      19934
2G_0      1      20026
2H_0      1      19936
3I_0      1      20084
3J_0      1      20011
3K_0      1      20117
3L_0      1      19901


### Step 2: Filter reads 
If for some reason we wanted to execute on just a subsample of our data, we could do this by selecting only certain samples to call the `step2` function on. Because `step2` is a function of `data`, it will always execute with the parameters that are linked to `data`. 

In [6]:
## run step 1 to demultiplex the data
data1.step1()

## print the results for each Sample in data1
print data1.stats
wat = data1.stats

samples already found in 2014_data use ip.merge() to combine samples 
from multipleAssembly objects


TypeError: drop() got an unexpected keyword argument 'how'

### Step 2: Filter reads 
If for some reason we wanted to execute on just a subsample of our data, we could do this by selecting only certain samples to call the `step2` function on. Because `step2` is a function of `data`, it will always execute with the parameters that are linked to `data`. 

In [71]:
## example of ways to run step 2 to filter and trim reads
#data1.step2("1A_0")            ## run on a single sample
data1.step2(["1B_0", "1C_0"])  ## run on one or more samples
#data1.step2()                  ## run on all samples, skipping finished ones

## print the results
print data1.stats

      state  reads_raw  reads_filtered  clusters_total  clusters_kept  \
1A_0      1      20099             NaN             NaN            NaN   
1B_0      2      19977           19977             NaN            NaN   
1C_0      2      20114           20114             NaN            NaN   
1D_0      1      19895             NaN             NaN            NaN   
2E_0      1      19928             NaN             NaN            NaN   
2F_0      1      19934             NaN             NaN            NaN   
2G_0      1      20026             NaN             NaN            NaN   
2H_0      1      19936             NaN             NaN            NaN   
3I_0      1      20084             NaN             NaN            NaN   
3J_0      1      20011             NaN             NaN            NaN   
3K_0      1      20117             NaN             NaN            NaN   
3L_0      1      19901             NaN             NaN            NaN   

      hetero_est  error_est  reads_consens  
1A_0 

### Do the read mapping (SE)
Here's an example cmdline run with args explained below:

smalt map -f sam -n 8 -l pp -o Arremon.sam zf-ref ../MarTum-fasta/ArremonR1.fa ../MarTum-fasta/ArremonR2.fa

* -f sams - you can also output as 'bam' but it requires installing bambamc which is explained in the smalt docs, but which seems annoying, esp cuz samtools will do it for us.
* -n sets the number of threads to 8, dramatically increases speed
* -l pp tells smalt about the orientation of the paired reads, in this case pp means both reads are on the same strand in the 5' to 3' direction, I think the second read was originally from the second strand and pyrad reverse complemented it.
* -o is the outfile
* Next is the indexed reference sequence and the files containing reads

Other options to look into:
* -y minid Filters output alignments by a threshold in the number of exactly
matching nucleotides.
* -r seed Determines how reads or mate pairs with multiple best mappings are
reported.

In [None]:
#data1.paramsdict["working_directory"]
data1.dirs

In [None]:
###############################
# This is all test junk, ignore
###############################
output = "/tmp/wat"

# Check the input files
SMALT_CMD = "check "
## the read1 demultiplexed reads file
fr1 = data1.get_params(1)+"/fastq/1A_0_R1_.gz"
#data1.smalt = "/usr/local/bin/smalt"
cmd = data1.smalt + " " + SMALT_CMD + " " + fr1
print cmd
subprocess.call(cmd, shell=True,
                     stderr=subprocess.STDOUT,
                     stdout=subprocess.PIPE)

SMALT_CMD = "map -f sam -n 8 -o " + output
## the read1 demultiplexed reads file

## TODO: I recommend using parameter descriptions rather than numbers
## in the code so it is more robust to potential reordering of parameters
fr1 = data1.get_params('working_directory')+"/fastq/1A_0_R1_.gz"

cmd = data1.smalt + " " + SMALT_CMD + " " + REFSEQ + " " + fr1
print cmd
subprocess.call(cmd, shell=True,
                     stderr=subprocess.STDOUT,
                     stdout=subprocess.PIPE)

## Get mapped and unmapped reads

First get some info about our mapping.

    samtools flagstat <yoursam>

Get only the mapped reads. 0x4 is a bitmask for 'unmapped' reads, -F means get all not this mask. In both cases -b outputs as bam

    samtools view -b -F 0x4 <your.sam> > mapped.bam

Same as above, but in this case -f means just give me the ones with this flag set.

    samtools view -b -f 0x4 <your.sam> > unmapped.bam

## 

samtools sort -T /tmp/wat -O bam test.mapped.bam > test.mapped.sorted.bam
samtools bam2fq test.mapped.sorted.bam

In [11]:
import pysam

#This is junk

print data1.muscle
print data1.vsearch
print data1.smalt
print data1.samples["1B_0"].files.edits
#bam2py("")
#pysam.view("-b", "-S", "-o") #, INDIVIDUALS_WORK_DIR+species+"/"+ind+"-"+refseq.split("/")[-1]+".bam", INDIVIDUALS_WORK_DIR+species+"/"+ind+"-"+refseq.split("/")[-1]+".sam", catch_stdout=False)
#pysam.sort( "-O", "bam", "-o", INDIVIDUALS_WORK_DIR+species+"/"+ind+"-"+refseq.split("/")[-1]+".bam", "-T", "tempfile", INDIVIDUALS_WORK_DIR+species+"/"+ind+"-"+refseq.split("/")[-1]+".bam", catch_stdout=False)
#pysam.index( INDIVIDUALS_WORK_DIR+species+"/"+ind+"-"+refseq.split("/")[-1]+".bam", catch_stdout=False)

/home/deren/Dropbox/ipyrad/bin/muscle3.8.31_i86linux64
/home/deren/Dropbox/ipyrad/bin/vsearch-1.1.3-linux-x86_64
/home/deren/Dropbox/ipyrad/bin/smalt-0.7.6-linux-x86_64
[]


### Step 3: clustering within-samples

In [72]:
## run step 3 to cluster reads within samples using vsearch
#data1.step3(preview=1) #["2H_0", "2G_0"], preview=1)
data1.step3(["1B_0", "1C_0"], preview=1)
## print the results
print data1.stats
print data1.samples["1B_0"].files.fastq[0]

Checking for reference sequence index. If it doesn't exist then create it.
This could take several minutes, but it's a one time penalty, so be patient.
None
Clustering 2 samples on 3 processors.
.
      state  reads_raw  reads_filtered  clusters_total  clusters_kept  \
1A_0      1      20099             NaN             NaN            NaN   
1B_0      3      19977           19977               1              0   
1C_0      3      20114           20114               1              0   
1D_0      1      19895             NaN             NaN            NaN   
2E_0      1      19928             NaN             NaN            NaN   
2F_0      1      19934             NaN             NaN            NaN   
2G_0      1      20026             NaN             NaN            NaN   
2H_0      1      19936             NaN             NaN            NaN   
3I_0      1      20084             NaN             NaN            NaN   
3J_0      1      20011             NaN             NaN            NaN   


### Example of plotting with _ipyrad_
There are a a few simple plotting functions in _ipyrad_ useful for visualizing results. These are in the module `ipyrad.plotting`. Below is an interactive plot for visualizing the distributions of coverages across the 12 samples in the test data set.  

In [None]:
import ipyrad as ip
import ipyrad.plotting as iplot

## reload autosaved data. In case you quit and came back 
#data1 = ip.load_dataobj("test_rad/2014_data.dataobj")

## plot for one or more selected samples
iplot.depthplot(data1, ["1A_0", "1B_0"])

## plot for all samples in data1
#iplot.depthplot(data1)

## save plot as pdf and html
iplot.depthplot(data1, outprefix="testfig")

### Step 4: Joint estimation of heterozygosity and error rate


In [None]:
## run step 4
data1.step4() #"2H_0", "2G_0")

## print the results
print data1.stats

### Step 5: Consensus base calls


In [None]:
## run step 5
data1.step5(["2H_0"])

## print the results
print data1.stats

### Quick parameter explanations are always on-hand

In [None]:
ip.get_params_info(10)

### Log history 
A common problem after struggling through an analysis is that you find you've completely forgotten what parameters you used at what point, and when you changed them. The log history time stamps all calls to `set_params()`, as well as calls to `step` methods. It also records copies/branching of data objects.  

In [None]:
for i in data1.log:
    print i

### Saving Assembly objects
Assembly objects can be saved and loaded so that interactive analyses can be started, stopped, and returned to quite easily. The format of these saved files is a serialized 'dill' object used by Python. Individual Sample objects are saved within Assembly objects. These objects to not contain the actual sequence data, but only link to it, and so are not very large. The information contained includes parameters and the log of Assembly objects, and the statistics and state of Sample objects. Assembly objects are autosaved each time an assembly `step` function is called, but you can also create your own checkpoints with the `save` command. 

In [4]:
## save assembly object
#ip.save_assembly("data1.p")

## load assembly object
data2 = ip.load_assembly("/tmp/ipyrad-test/test.assembly")
print data2.name
#print data.stats
for sample in data2.samples:
    data2.samples[sample].stats.state = 2#.state
print data2.stats
#data2.set_params(4, "/tmp/ipyrad-test/test_edits/")
#print data2
print data2.samples["1A_0"].files.clusters
#data2.step2(force=True)

test
      state  reads_raw  reads_filtered  clusters_total  clusters_kept  \
1A_0      2      20099           20099               1              0   
1B_0      2      19977           19977               1              0   
1C_0      2      20114           20114               1              0   
1D_0      2      19895           19895               1              0   
2E_0      2      19928           19928               1              0   
2F_0      2      19934           19934               1              0   
2G_0      2      20026           20026               1              0   
2H_0      2      19936           19936               1              0   
3I_0      2      20084           20084               1              0   
3J_0      2      20011           20011               1              0   
3K_0      2      20117           20117               1              0   
3L_0      2      19901           19901               1              0   

      hetero_est  error_est  reads_consens  


In [5]:
from ipyrad import assemble
data2.step3(["1A_0"], preview=True, force=True)
#assemble.cluster_within.derep_and_sort( data1, data1.samples["3L_0"], 0 )
data2.stats

Checking for reference sequence index. If it doesn't exist then create it.
This could take several minutes, but it's a one time penalty, so be patient.
None
Clustering 1 samples on 4 processors.
.




Unnamed: 0,state,reads_raw,reads_filtered,clusters_total,clusters_kept,hetero_est,error_est,reads_consens
1A_0,3,20099,20099,1,0,,,
1B_0,2,19977,19977,1,0,,,
1C_0,2,20114,20114,1,0,,,
1D_0,2,19895,19895,1,0,,,
2E_0,2,19928,19928,1,0,,,
2F_0,2,19934,19934,1,0,,,
2G_0,2,20026,20026,1,0,,,
2H_0,2,19936,19936,1,0,,,
3I_0,2,20084,20084,1,0,,,
3J_0,2,20011,20011,1,0,,,


In [None]:
assemble.cluster_within.muscle_align( data1, data1.samples["1D_0"])

## Working ipyparallel toy for testing
Most stuff below here requires this codeblock to be run, to init ipyparallel

In [92]:
import ipyparallel
print( ipyparallel.__version__)
from ipyparallel import Client
ipyclient = Client()
print(ipyclient.ids)
dview = ipyclient.load_balanced_view()
parallel_result = dview.map_async(lambda x:x**10, range(32))
print(parallel_result.get())
#print(parallel_result)
#res = dview.map_async(print, "Hello, World")
#print(res)
del dview
ipyclient.close()

4.1.0
[0, 1, 2, 3]
[0, 1, 1024, 59049, 1048576, 9765625, 60466176, 282475249, 1073741824, 3486784401, 10000000000, 25937424601, 61917364224, 137858491849, 289254654976, 576650390625, 1099511627776, 2015993900449, 3570467226624, 6131066257801, 10240000000000, 16679880978201, 26559922791424, 41426511213649, 63403380965376, 95367431640625, 141167095653376, 205891132094649, 296196766695424, 420707233300201, 590490000000000, 819628286980801]


## Toy code for testing cluster_within on a subsample

In [94]:
ipyclient = Client()
print(ipyclient.ids)
samp1="1A_0"
subsamples = []
data1.samples[samp1].stats.state = 2
subsamples.append((sample, data1.samples[samp1]))

print data1.samples[samp1].files.edits
assemble.cluster_within.run( data1, subsamples, ipyclient, True, True, True)
ipyclient.close()

[0, 1, 2, 3]
[]


CompositeError: one or more exceptions from call to method: mapreads
[3:apply]: IndexError: list index out of range

## Code for directly debugging mapreads() outside of ipyparallel

In [95]:
#Debugging smalt code in cluster_within
samp1="1A_0"
sample_obj=data1.samples[samp1]
data1.samples[samp1].stats.state = 2
sample = subsamples[0][1]
print sample
assemble.cluster_within.mapreads([data2, sample_obj, 1, 0, 4])


<ipyrad.core.sample.Sample object at 0x112d8c190>
preview: in run_full, using 4


AttributeError: No such attribute: edits

In [24]:
# Debug derep_and_sort
samp1="1A_0"
sample_obj=data2.samples[samp1]
data2.samples[samp1].stats.state = 2
assemble.cluster_within.derep_and_sort(data2, sample_obj, 1, 4)
print(sample_obj.files.edits)
handle = sample_obj.files.edits[0]
cmd = data2.vsearch+\
    " -derep_fulllength "+handle+\
    " "+\
    " -output "+os.path.join(data2.dirs.edits, sample_obj.name+".derep")+\
    " -sizeout "+\
    " -threads "+str(4)+\
    " -fasta_width 0"
print(cmd)

['/private/tmp/ipyrad-test/test_edits/1A_0.fasta']
/usr/local/opt/anaconda/lib/python2.7/site-packages/ipyrad-0.0.65-py2.7.egg/bin/vsearch-1.1.3-osx-x86_64 -derep_fulllength /private/tmp/ipyrad-test/test_edits/1A_0.fasta  -output /private/tmp/ipyrad-test/test_edits/1A_0.derep -sizeout  -threads 4 -fasta_width 0


In [46]:
import itertools
samp1="1A_0"
sample=data2.samples[samp1]
unmapped_fastq_handle=os.path.join(data2.dirs.edits, sample.name+".fasta")
with open(os.path.realpath(unmapped_fastq_handle), 'rb') as fq:
    quart1 = itertools.izip(*[iter(fq)]*4)
    quarts = itertools.izip(quart1, iter(int, 1))
    read1 = [i.strip() for i in quart[0]]
    writing = []
    while 1:
        try:
            quart = quarts.next()
        except StopIteration:
            break
        read1 = [i.strip() for i in quart[0]]
        sseq = ">"+sample.name+"_"+str(0)+\
                           "_c1\n"+read1[1]+"\n"
        writing.append(sseq)
print(sample.files.edits[0])
with open( sample.files.edits[0], 'w' ) as out:
    out.write("".join(writing))

/private/tmp/ipyrad-test/test_edits/1A_0.fasta


In [59]:
samp1="3L_0"
sample=data2.samples[samp1]
unmapped_fastq_handle=os.path.join(data2.dirs.edits, sample.name+".fastq")
print(unmapped_fastq_handle)
if True:
    writing = []
    with open(os.path.realpath(unmapped_fastq_handle), 'rb') as fq:
        quart1 = itertools.izip(*[iter(fq)]*4)
        quarts = itertools.izip(quart1, iter(int, 1))
        writing = []
        while 1:
            try:
                quart = quarts.next()
            except StopIteration:
                break
            read1 = [i.strip() for i in quart[0]]
            sseq = ">"+sample.name+"_"+str(0)+\
                           "_c1\n"+read1[1]+"\n"
            writing.append(sseq)

    with open( sample.files.edits[0], 'w' ) as out:
        out.write("".join(writing))

/private/tmp/ipyrad-test/test_edits/3L_0.fastq


In [74]:
quart=[]
quarts=[]
quart1=[]
nthreads=4
preview=True
samp1="1B_0"
sample=data1.samples[samp1]
data=data1
if True:
    samhandle = os.path.join(data.dirs.edits, sample.name+".sam")
    bamhandle = os.path.join(data.dirs.edits, sample.name+".bam")
    unmapped_fastq_handle = os.path.join(data.dirs.edits, sample.name+".fastq")

    ## get call string
    cmd = data.smalt+\
        " map -f sam -n " + str(nthreads) +\
        " -o " + samhandle +\
        " " + data.get_params(28) +\
        " " + sample.files.fastq[0]

    ## run smalt
    if preview:
        ## make this some kind of wait command that kills after a few mins
        subprocess.call(cmd, shell=True,
                             stderr=subprocess.STDOUT,
                             stdout=subprocess.PIPE)
    else:
        subprocess.call(cmd, shell=True,
                             stderr=subprocess.STDOUT,
                             stdout=subprocess.PIPE)

    cmd = data.samtools+\
        " view -b -f 0x4 "+samhandle+\
            " > " + bamhandle
    subprocess.call(cmd, shell=True,
                         stderr=subprocess.STDOUT,
                         stdout=subprocess.PIPE)

    cmd = data.samtools+\
        " sort -T "+samhandle+".tmp" +\
        " -O bam "+bamhandle+\
        " -o "+bamhandle+".sorted"
    subprocess.call(cmd, shell=True,
                         stderr=subprocess.STDOUT,
                         stdout=subprocess.PIPE)
    cmd = data.samtools+\
        " bam2fq "+bamhandle+".sorted"+\
        " > "+unmapped_fastq_handle
    subprocess.call(cmd, shell=True,
                         stderr=subprocess.STDOUT,
                         stdout=subprocess.PIPE)

    ## This is hax to get fastq to fasta to get this off the ground.
    ## samtools bam2fq natively returns fastq, you just delete this code
    ## when fastq pipleline is working
    writing = []
    with open(os.path.realpath(unmapped_fastq_handle), 'rb') as fq:
        quart1 = itertools.izip(*[iter(fq)]*4)
        quarts = itertools.izip(quart1, iter(int, 1))
        writing = []
        while 1:
            try:
                quart = quarts.next()
            except StopIteration:
                break
            read1 = [i.strip() for i in quart[0]]
            sseq = ">"+sample.name+"_"+str(0)+\
                           "_c1\n"+read1[1]+"\n"
            writing.append(sseq)

    with open( sample.files.edits[0], 'w' ) as out:
        out.write("".join(writing))

In [135]:
cmd = data.samtools+\
    " flagstat "+bamhandle+".sorted"
result = subprocess.check_output(cmd, shell=True, 
                            stderr=subprocess.STDOUT )
print(result)

#result = subprocess.check_output( cmd, shell=True, 
#                                       stderr=subprocess.STDOUT )
sample.stats.refseq_unmapped_reads=int(result.split()[0])
print(sample.stats)


13733 + 0 in total (QC-passed reads + QC-failed reads)
0 + 0 secondary
0 + 0 supplementary
0 + 0 duplicates
0 + 0 mapped (0.00% : N/A)
0 + 0 paired in sequencing
0 + 0 read1
0 + 0 read2
0 + 0 properly paired (N/A : N/A)
0 + 0 with itself and mate mapped
0 + 0 singletons (N/A : N/A)
0 + 0 with mate mapped to a different chr
0 + 0 with mate mapped to a different chr (mapQ>=5)

state                        2
reads_raw                20099
reads_filtered             NaN
clusters_total             NaN
clusters_kept              NaN
hetero_est                 NaN
error_est                  NaN
reads_consens              NaN
refseq_unmapped_reads    13733
dtype: object


In [16]:
import ipyrad as ip
TEST = ip.load_assembly("/tmp/ipyrad-test/test-refseq.assembly")
TEST.get_params()
TEST.step3( ["1A_0"], preview=True, force=True)
TEST.stats

  1   working_directory             /tmp/ipyrad-test                             
  2   raw_fastq_path                ./data/sim_rad_test_R1_.fastq.gz             
  3   barcodes_path                 ./data/sim_rad_test_barcodes.txt             
  4   sorted_fastq_path             key not recognized                           
  5   restriction_overhang          ('TGCAG', '')                                
  6   max_low_qual_bases            5                                            
  7   N_processors                  4                                            
  8   mindepth_statistical          6                                            
  9   mindepth_majrule              6                                            
  10  datatype                      rad                                          
  11  clust_threshold               0.85                                         
  12  minsamp                       4                                            
  13  max_shared

Unnamed: 0,state,reads_raw,reads_filtered,refseq_mapped_reads,refseq_unmapped_reads,clusters_total,clusters_kept
1A_0,3,20099,20099,6297,13802,25,24
1B_0,3,19977,19977,6244,13733,24,23
1C_0,3,20114,20114,6280,13834,22,22
1D_0,3,19895,19895,6263,13632,22,22
2E_0,3,19928,19928,6055,13873,23,23
2F_0,3,19934,19934,6171,13763,24,22
2G_0,3,20026,20026,5980,14046,24,24
2H_0,3,19936,19936,6052,13884,23,22
3I_0,3,20084,20084,6391,13693,23,21
3J_0,3,20011,20011,6190,13821,22,22


## Hax for testing why clustering unmapped reads returns so few clusters???

In [33]:
import gzip 
import itertools
import numpy as np

sample = TEST.samples["1A_0"]
data = TEST
#ip.assemble.cluster_within.cleanup(TEST, TEST.samples["1A_0"])
if True:
    sample.files.clusters = os.path.join(data.dirs.clusts,
                                         sample.name+".clustS.gz")

    print(sample.files.clusters)
    ## get depth stats
    infile = gzip.open(sample.files.clusters)
    duo = itertools.izip(*[iter(infile)]*2)
    depth = []
    thisdepth = []
    while 1:
        try:
            itera = duo.next()[0]
            #print(itera)
        except StopIteration:
            print("wat")
            break
        if itera != "//\n":
            thisdepth.append(int(itera.split(";")[-2][5:]))
        else:
            ## append and reset
            depth.append(sum(thisdepth))
            thisdepth = []
    infile.close()

    if depth:
        ## make sense of stats
        depth = np.array(depth)
        print(depth)
        keepmj = depth[depth >= data.paramsdict["mindepth_majrule"]]
        keepstat = depth[depth >= data.paramsdict["mindepth_statistical"]]
        ## sample assignments
        sample.stats["state"] = 3
        sample.stats["clusters_total"] = len(depth)
        sample.stats["clusters_kept"] = max([len(i) for i in \
                                             (keepmj, keepstat)])
        sample.depths.total = depth
        sample.depths.mjmin = keepmj
        sample.depths.statmin = keepstat

        data.stamp("s3 clustering on "+sample.name)

/private/tmp/ipyrad-test/test-refseq_clust_0.85/1A_0.clustS.gz
wat
[18 21 20 18 24 23 23 20 25 20 20 23 24 17 21 20 20 19 17 18 20 18 17 23 16
 24 24 19 20 21 22 25 20 19 22 21 12 18 24 20 20 22 20 16 16 17 18 20 19 17
 19 19 20 20 19 18 20 21 17 19 21 23 19 19 21 17 21 28 21 20 18 19 23 18 22
 21 23 16 17 20 16 22 15 19 17 19 20 22 22 26 19 24 24 22 13 18 20 19 21 17
 20 19 18 17 16 22 18 19 17 22 19 22 19 18 24 20 21 15 21 16 17 21 18 15 24
 19 21 17 20 22 18 26 23 24 22 22 21 23 21 21 26 17 22 22 18 19 25 23 19 23
 26 18 20 20 22 24 16 23 20 24 24 21 17 20 18 21 20 22 21 20 18 20 23 19 20
 18 21 22 22 16 22 21 22 19 20 25 16 22 22 18 24 19 17 22 23 20 21 17 21 17
 19 22 19 14 21 19 19 17 18 17 21 24 19 22 22 20 19 21 20 19 20 17 20 18 19
 23 23 23 15 20 22 18 16 19 22 18 23 22 18 18 22 23 24 20 20 21 18 26 19 19
 20 19 18 22 21 17 19 22 21 20 22 18 22 25 23 16 21 18 22 21 21 21 23 19 21
 20 22 22 17 23 23 16 19 18 23 18 20 23 19 15 25 20 24 21 24 19 18 17 18 19
 19 21 16 18 17 19 21