### Building the geno output

In [9]:
import h5py 
import numpy as np
import ipyrad as ip
import ipyparallel as ipp
from ipyrad.assemble.write_outfiles import *

In [10]:
## load the saved Assembly object from its JSON file (the path is relative
## to the notebook's working directory)
data = ip.load_json("cli/cli.json")

  loading Assembly: cli
  from saved path: ~/Documents/ipyrad/tests/cli/cli.json


In [24]:
## Sample objects of this assembly, held as a concrete list so that it can
## be iterated several times and measured with len() on Python 2 and 3
samples = list(data.samples.values())

## Boolean mask over the sample axis that keeps every sample. It has to be
## a boolean array: a plain list of integer 1s is read by numpy as integer
## (fancy) indexing and would pull out sample index 1 once per entry
## instead of keeping all samples.
## NOTE(review): assumes the sample axis of the database arrays has exactly
## len(samples) entries -- confirm.
sidx = np.ones(len(samples), dtype=bool)


In [13]:
## connect to the ipyparallel cluster and show the ids of its engines
ipyclient = ipp.Client()
ipyclient.ids

[0, 1, 2, 3]

In [14]:
## prepare dirs: outputs go in <project dir>/<assembly name>_outfiles,
## which is created if it does not exist yet.
## NOTE(review): `os` is not imported explicitly in the visible cells; it is
## presumably in scope through the star import -- confirm.
data.dirs.outfiles = os.path.join(data.dirs.project, data.name+"_outfiles")
if not os.path.exists(data.dirs.outfiles):
    os.mkdir(data.dirs.outfiles)

## make the snps/filters data base, fills the dups and inds filters
## and fills the splits locations.
## The database path is stored on `data` before init_arrays(data) is called
## (presumably init_arrays reads data.database -- confirm).
data.database = os.path.join(data.dirs.outfiles, data.name+".hdf5")
init_arrays(data)

In [15]:
## run the locus filters for the selected samples on the ipyparallel client
filter_all_clusters(data, samples, ipyclient)

## Everything needed is in the now filled h5 database. Filters were applied
## with 'samples' taken into account. Now we create the loci file (default)
## output and build a stats file.
## The .loci path is stored on `data` before make_loci_and_stats is called.
data.outfiles.loci = os.path.join(data.dirs.outfiles, data.name+".loci")
make_loci_and_stats(data, samples, ipyclient)

## OPTIONAL OUTPUTS: the requested formats come from the assembly params
output_formats = data.paramsdict["output_formats"]

  [####################] 100%  filtering loci        | 0:00:06 | s7 | 
  [####################] 100%  building loci/stats   | 0:00:00 | s7 | 


In [16]:
## The vcf output is handled apart from the other output_formats because
## it is big and its build is parallelized.
if "v" in output_formats or "V" in output_formats:
    ## an upper-case "V" in the formats is passed on as full=True
    full = "V" in output_formats
    try:
        make_vcf(data, samples, ipyclient, full=full)
    except IPyradWarningExit as inst:
        ## The vcf build failed. Sometimes this is simply a memory issue,
        ## so the exception is trapped here to let the other output
        ## formats still be attempted.
        print("  Error building vcf. See ipyrad_log.txt for details.")

  [####################] 100%  building vcf file     | 0:00:02 | s7 | 
  [####################] 100%  writing vcf file      | 0:00:00 | s7 | 


In [21]:
## window of loci to read: `optim` loci beginning at `start`
start = 0
optim = 100
hslice = [start, start+optim]

## maximum fragment length setting plus 20
maxlen = data._hackersonly["max_fragment_length"] + 20

## Read the window from disk first (faster) and mask afterwards; the mask
## keeps the loci for which no filter is set.
with h5py.File(data.database, 'r') as co5:
    afilt = co5["filters"][hslice[0]:hslice[1], :]
    keepmask = np.sum(afilt, axis=1) == 0
    ## edges of the kept loci only
    aedge = co5["edges"][hslice[0]:hslice[1], :][keepmask, :]
del afilt

In [25]:
with h5py.File(data.clust_database, 'r') as io5:
    ## read the window for all samples, then keep only the unfiltered loci
    ## (keepmask, first axis) and the selected samples (sidx, second axis)
    aseqs = io5["seqs"][hslice[0]:hslice[1], :, :].view(np.uint8)
    aseqs = aseqs[keepmask, :][:, sidx, :]
    acatg = io5["catgs"][hslice[0]:hslice[1], :, :, :]
    acatg = acatg[keepmask, :][:, sidx, :, :]

In [41]:
## dimensions of the catg array after the locus mask and sample selection
acatg.shape

(100, 12, 116, 4)

In [195]:
## Open both HDF5 databases inside a `with` block so the file handles are
## always closed, even if make_arrays raises. The bare h5py.File() calls
## used previously left both files open after every run of this cell.
## NOTE(review): io5/co5 are closed once this cell finishes; this assumes
## make_arrays returns in-memory arrays -- reopen the files if a later cell
## needs them.
with h5py.File(data.clust_database, 'r') as io5, \
        h5py.File(data.database, 'r') as co5:

    ## will iterate optim loci at a time
    optim = io5["seqs"].attrs["chunksize"][0]
    nloci = io5["seqs"].shape[0]

    ## get name and snp padding
    anames = io5["seqs"].attrs["samples"]
    snames = [i.name for i in samples]
    ## get only snames in this data set sorted in the order they are in io5
    names = [i for i in anames if i in snames]
    pnames, _ = padnames(names)

    ## get names boolean: True for each database sample that is in snames
    sidx = np.array([i in snames for i in anames])
    assert len(pnames) == sum(sidx)

    ## build arrays and outputs from arrays.
    ## TODO, don't block during make-arrays
    arrs = make_arrays(data, sidx, optim, nloci, io5, co5)
    seqarr, snparr, bisarr, maparr = arrs

In [196]:
## inspect the snp array returned by make_arrays
snparr

array([['T', 'T', 'T', ..., 'C', 'G', 'A'],
       ['T', 'T', 'T', ..., 'C', 'G', 'A'],
       ['T', 'T', 'T', ..., 'C', 'G', 'A'],
       ..., 
       ['T', 'C', 'T', ..., 'C', 'G', 'A'],
       ['T', 'T', 'T', ..., 'C', 'G', 'G'],
       ['A', 'T', 'T', ..., 'C', 'G', 'A']], 
      dtype='|S1')

In [197]:
## NOTE: in the visible notebook snpref is only assigned in the following
## cell (In [198]); on a fresh kernel this cell fails with a NameError
## unless that cell has been run first.
snpref

array([['T', 'C', 'A', ''],
       ['T', 'C', '', ''],
       ['T', 'A', '', ''],
       ..., 
       ['C', 'A', '', ''],
       ['G', 'T', '', ''],
       ['A', 'G', '', '']], 
      dtype='|S1')

In [198]:
## Build the reference arrays for the snp and bisnp arrays: each character
## array is viewed as int8 for reftrick and the result is viewed back as
## one-byte strings ("S1").
## NOTE(review): reftrick and GETCONS are not defined in the visible cells;
## presumably they come from the star import -- confirm.
snpref = reftrick(snparr.view(np.int8), GETCONS).view("S1")
bisref = reftrick(bisarr.view(np.int8), GETCONS).view("S1")


In [245]:
## Genotype matrix with the same shape as snparr, every cell starting out
## as 9. Ordering follows the .loci file, which is alphanumeric.
snpgeno = np.full(snparr.shape, 9, dtype=np.uint8)

In [246]:
## length of one snparr row next to the length of the first snpref column
print(snparr[0].shape)
print(snpref[:, 0].shape)

(4018,)
(4018,)


In [248]:
## fill in complete hits: every cell of snparr that equals the first column
## of snpref for its site is coded 2
mask2 = np.array(snparr == snpref[:, 0])
snpgeno[mask2] = 2
snpgeno

array([[2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ..., 
       [2, 9, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 9],
       [9, 2, 2, ..., 2, 2, 2]], dtype=uint8)

In [249]:
## fill in single hits (heteros): for every site look up the TRANS entry
## for the pair made of the first two snpref columns, then code every cell
## of snparr that equals that entry as 1.
## NOTE(review): TRANS is not defined in the visible cells; presumably it
## maps a pair of bases to its ambiguity code -- confirm.
ambref = np.apply_along_axis(lambda x: TRANS[tuple(x)], 1, snpref[:, :2])
mask1 = np.array(snparr == ambref)
snpgeno[mask1] = 1
snpgeno

array([[2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ..., 
       [2, 9, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 9],
       [9, 2, 2, ..., 2, 2, 2]], dtype=uint8)

In [250]:
## fill in zero hits (match to second base): every cell of snparr that
## equals the second column of snpref for its site is coded 0; cells that
## matched none of the three masks keep the initial value 9
mask0 = np.array(snparr == snpref[:, 1])
snpgeno[mask0] = 0
snpgeno

array([[2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 2],
       ..., 
       [2, 0, 2, ..., 2, 2, 2],
       [2, 2, 2, ..., 2, 2, 0],
       [9, 2, 2, ..., 2, 2, 2]], dtype=uint8)

In [None]:
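## TODO: remove sites with a third base (not implemented yet; cells that match neither of the first two snpref bases currently stay coded 9)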


In [251]:
## Preview the first 20 columns of each array, transposed so that every
## printed row is one column of the array: genotype codes first, then a
## blank line, then the matching base calls.
np.savetxt(sys.stdout, snpgeno.T[:20], delimiter="", fmt="%d")
print('')
np.savetxt(sys.stdout, snparr.T[:20], delimiter="", fmt="%s")


222200202229
222222222022
222220222222
222022222222
222022222222
222222222122
222222202222
222222222202
222222122222
022222222222
222022222222
000022222222
222220222222
222222212222
222222212222
221222222222
222222220000
222222222002
222222222002
222222222220

TTTTCCTCTTTA
TTTTTTTTTCTT
TTTTTATTTTTT
GGGTGGGGGGGG
AAAGAAAAAAAA
GGGGGGGGGSGG
CCCCCCCTCCCC
CCCCCCCCCCAC
GGGGGGKGGGGG
CAAAAAAAAAAA
CCCTCCCCCCCC
TTTTCCCCCCCC
GGGGGCGGGGGG
CCCCCCCMCCCC
TTTTTTTWTTTT
AAWAAAAAAAAA
AAAAAAAAGGGG
AAAAAAAAATTA
TTTTTTTTTAAT
GGGGGGGGGGGT
