In [1]:
import ipyrad as ip
from ipyrad.assemble.cluster_across import *
import ipyparallel as ipp

In [2]:
## connect to an already-running ipyparallel cluster (no profile or
## connection file is given, so ipyparallel's defaults are used)
ipyclient = ipp.Client()
## display how many engines the client can see
len(ipyclient)

40

In [3]:
## load a previously saved Assembly from its JSON file (relative path)
data = ip.load_json("analysis-ipyrad/aligntest.json")
## the values of data.samples; later cells read the .name of each one
samples = data.samples.values()
## NOTE(review): the three settings below are not referenced by any cell
## visible in this notebook -- presumably they mirror arguments of the
## ipyrad step functions; confirm before removing
noreverse = False
force = True
randomseed = 12345

  loading Assembly: aligntest
  from saved path: ~/Documents/ipyrad/tests/analysis-ipyrad/aligntest.json


### RUN

### Step 7

In [4]:
from ipyrad.assemble.write_outfiles import *

In [5]:
## output directory: <project dir>/<assembly name>_outfiles, created on
## first use
outdir = os.path.join(data.dirs.project, data.name + "_outfiles")
data.dirs.outfiles = outdir
if not os.path.exists(outdir):
    os.mkdir(outdir)

## location of the snps/filters database inside the output directory;
## init_arrays() builds it (fills the dups and inds filters and the
## splits locations)
data.database = os.path.join(outdir, data.name + ".hdf5")
init_arrays(data)

In [20]:
%%timeit
## grab super seqs and upper it
with h5py.File(data.clust_database) as io5:
    superints = io5["seqs"][0:100000, 1, :].view(np.int8)
    #print(superints.shape)
    #print(superints[0:3])


1 loop, best of 3: 641 ms per loop


In [26]:
## Load the raw characters (no int8 view) for index 1 on the second axis of
## the first 100000 "seqs" entries, and build a boolean mask of the
## positions that are lowercase.
## Opened explicitly read-only ('r') so this exploratory read can never
## modify the cluster database (older h5py defaults to a read/write mode).
with h5py.File(data.clust_database, 'r') as io5:
    superints = io5["seqs"][0:100000, 1, :]
    mask = np.char.islower(superints)

In [27]:
## display an uppercased copy of the characters loaded in the previous cell
## (the result is only shown, not assigned)
np.char.upper(superints)#.view(np.int8)

array([['T', 'G', 'C', ..., 'N', 'N', 'N'],
       ['T', 'G', 'C', ..., 'N', 'N', 'N'],
       ['T', 'G', 'C', ..., 'N', 'N', 'N'],
       ..., 
       ['T', 'G', 'C', ..., 'N', 'N', 'N'],
       ['N', 'N', 'N', ..., 'N', 'N', 'N'],
       ['N', 'N', 'N', ..., 'N', 'N', 'N']], 
      dtype='|S1')

In [6]:
## run the locus filtering for the selected samples using the ipyparallel
## client (shown as "filtering loci" in the progress bar)
filter_all_clusters(data, samples, ipyclient)

  [####################] 100%  filtering loci        | 0:00:09 | s7 | 


In [29]:
## read only what is needed from the cluster database, then close it
with h5py.File(data.clust_database, 'r') as io5:
    anames = io5["seqs"].attrs["samples"]
    nloci = io5["seqs"].shape[0]
    optim = io5["seqs"].attrs["chunksize"][0]

## names of the samples selected for this run
snames = [sample.name for sample in samples]

## selected names, kept in the order in which the database stores them,
## and their padded versions (snp padding is not needed here)
names = [aname for aname in anames if aname in snames]
pnames, _ = padnames(names)

## boolean index over the database's sample axis: True where selected
sidx = np.array([aname in snames for aname in anames])

In [40]:
## scratch check: start positions produced by a stepped range (the same
## start/stop/step pattern is used to chunk the loci in the next cell)
range(0, 50, 21)

[0, 21, 42]

In [41]:
## start the progress bar for the loci/stats step
start = time.time()
elapsed = datetime.timedelta(seconds=int(time.time() - start))
progressbar(20, 0, " building loci/stats   | {} | s7 |".format(elapsed))

## read the chunk size (loci handled per job), the number of loci, and the
## sample names stored with the "seqs" dataset
with h5py.File(data.clust_database, 'r') as io5:
    anames = io5["seqs"].attrs["samples"]
    nloci = io5["seqs"].shape[0]
    optim = io5["seqs"].attrs["chunksize"][0]

## padded names and snp padding for every stored sample
pnames, snppad = padnames(anames)
snames = [sample.name for sample in samples]
## True where a stored sample is NOT one of the selected samples
smask = np.array([aname not in snames for aname in anames])

## per-sample count of loci that pass all filters
samplecov = np.zeros(len(anames), dtype=np.int32)

## per-coverage count of loci, seeded with zero for every possible coverage
## from 0 up to len(anames) (not just those above min_samples_locus)
locuscov = Counter(dict.fromkeys(range(len(anames) + 1), 0))

## load-balanced view used to hand jobs to the parallel engines
lbview = ipyclient.load_balanced_view()

## submit one job per chunk of `optim` loci, keyed by the chunk start
loci_asyncs = {}
for istart in xrange(0, nloci, optim):
    args = [data, optim, pnames, snppad, smask, istart, samplecov, locuscov]
    loci_asyncs[istart] = lbview.apply(locichunk, args)


  [                    ]   0%  building loci/stats   | 0:00:00 | s7 | 

In [42]:
## poll the submitted jobs, redrawing the progress bar on each pass, until
## every job reports ready
while True:
    done = [rasync.ready() for rasync in loci_asyncs.values()]
    nfinished = sum(done)
    elapsed = datetime.timedelta(seconds=int(time.time() - start))
    progressbar(len(done), nfinished,
        " building loci/stats   | {} | s7 |".format(elapsed))
    time.sleep(0.1)
    if nfinished == len(done):
        print("")
        break

## log and re-raise the first failure found among the finished jobs
for job, rasync in loci_asyncs.items():
    if rasync.ready() and not rasync.successful():
        failure = rasync.exception()
        LOGGER.error("error in building loci [%s]: %s", job, failure)
        raise IPyradWarningExit(failure)

  [####################] 100%  building loci/stats   | 0:00:20 | s7 | 


In [43]:
## collect the return value of every chunk job
results = []
for rasync in loci_asyncs.values():
    results.append(rasync.get())

## fold each chunk's tallies into the running totals: element 0 is added to
## the per-sample counts, element 1 to the per-coverage counts.
## NOTE: both updates accumulate in place, so running this cell twice
## counts every chunk twice.
for chunk in results:
    samplecov += chunk[0]
    locuscov.update(chunk[1])


In [44]:
## temporary chunk files are named <loci file>.<start>; collect them and
## order them numerically by that trailing start value
tmploci = sorted(
    glob.glob(data.outfiles.loci + ".[0-9]*"),
    key=lambda path: int(path.rsplit(".", 1)[-1]))

In [47]:
## pick the first chunk file (lowest start value after the sort above)
## for inspection
tmploc = tmploci[0]

In [56]:
alleles = []

## Inspect the first ten loci of one temporary chunk file.
## The split text is stored in `locblocks`, NOT in `data`: `data` is the
## Assembly object, and rebinding it to a list of strings here would make
## every later use such as data.outfiles.loci fail with AttributeError.
with open(tmploc) as inloc:
    ## loci are separated by a "|" that ends the last line of each locus
    locblocks = inloc.read().split("|\n")

for loc in locblocks[:10]:
    lines = loc.split("\n")
    ## every line but the last is a "name  sequence" row
    ldata = lines[:-1]
    ## the last line of a locus -- presumably its snp annotation string
    snps = lines[-1]

    print(ldata)

['29154_superba          TCTGGTCCCGCGGGTGATCAAGGCCCCACCACCGCGTCTCACATTTTCGATCTCAGGCGGTCTT', '30556_thamno           TCCGGTCCCGCGGGTGATCAAGGCCCCACCACCGCGTCTCACATTCTAGATCTCAGGCGGTCTT', '30686_cyathophylla     TCCAGTCCCGCGGGTGATCAAGGCCCCACCACCGCATCTCACATTCTCGATCTCAGGCGGTCTT', '33413_thamno           TCCGGTCCTTCGGGTGATCAAGGCCCCACCACCGCGTCTCACATTCTAGATCTCAGGCGGTCTT']
['29154_superba          AATGATGGTGGTACACATATTAATTACAATTTGGACAAC', '30556_thamno           ACAGATGGTGGTACACATGTCAATTACAATTTGGATAAC', '30686_cyathophylla     AATGATGGTGGTACACATATTAATTACAATTTGGACAAC', '33413_thamno           AGTGATGGTGGTACACATGTCNANTACAATTTGGACAAC']
['29154_superba          CATTAATCAGC-AAAAAAACACTCACTTTAAAGAAAAATGAATAACTCCAACAGCATGAGCTAC', '30556_thamno           CATTAATCAGC-AAAAAAATACTCACTTTAAAG-AAAATGAATAACTCCAACAGCATGAGCTAC', '30686_cyathophylla     CATTAATCAGCAAAAAAAACACTCACTTTAAAG-AAAATGAATAACTCCAACAGCATGAGCTAC', '33413_thamno           CATTAATCAGC-AAAAAAATACTCACTTTAAAG-AAAATGAATAACACCAACAGCATGAGCTAC']
['291

In [None]:
## concatenate the sorted chunk files into the final loci file, deleting
## each chunk as soon as its contents have been written
with open(data.outfiles.loci, 'w') as locifile:
    for tmploc in tmploci:
        with open(tmploc, 'r') as inloc:
            chunktext = inloc.read()
        locifile.write(chunktext)
        os.remove(tmploc)