In [None]:
%pylab inline

# sourmash algorithms and implementation

topics:
    
* modulo hash, "density hashing",
    * https://github.com/dib-lab/sourmash/issues/823
    * https://github.com/dib-lab/sourmash/issues/606
    * https://github.com/richarddurbin/modimizer
* 'scaled' implementation, and using scaled hashes for Jaccard similarity and containment calculations


In [7]:
import screed

def kmerize_seq(sequence, k=31):
    sequence = sequence.upper()
    for start in range(0, len(sequence) - k + 1):
        kmer = sequence[start:start+k]
        
        # canonicalize the k-mer
        revcomp = screed.rc(kmer)
        if kmer < revcomp:
            yield kmer
        else:
            yield revcomp
            
#list(kmerize('ATGGACAGAGATG', k=2))

def kmerize_file(filename, k=31):
    # walk through every record in a FASTA/FASTQ file
    for record in screed.open(filename):
        # k-merize each sequence
        for kmer in kmerize_seq(record.sequence[:50000]):
            # return canonical version of each k-mer
            yield kmer

len(set(kmerize_file('data/genomes/2.fa')))



49970

In [8]:
import mmh3

In [25]:
import mmh3

# implementation of a minhash bottom sketch, where a fixed number
# of hashes is kept for each input set of k-mers.

# there's a bug in this code where we are not handling duplicate k-mers
# properly.
def minhash(kmers, num=500):
    basket = []
    for kmer in kmers:
        hashval = mmh3.hash64(kmer, seed=42)[0]
        if hashval < 0:
            hashval += 2**64
        
        # basket not full? add hash value
        if len(basket) < num:
            basket.append(hashval)
            basket = list(sorted(basket))
        # basket full?
        elif len(basket) == num:
            if basket[-1] > hashval:
                # this hashvalue does not belong in basket
                pass
            else:
                # is new hash value less than largest? if so, evict.
                basket.pop()
                basket.append(hashval)
                basket = list(sorted(basket))
                
    return basket

b_all = set(kmerize_file('data/genomes/2.fa'))
c_all = set(kmerize_file('data/genomes/47.fa'))
d_all = set(kmerize_file('data/genomes/63.fa'))


In [29]:
b = minhash(b_all, num=1000)
c = minhash(c_all, num=1000)
d = minhash(d_all, num=1000)


In [30]:
print(len(b_all), len(c_all), len(d_all))
print(len(b), len(c), len(d))

49970 96808 169296
1000 1000 1000


In [31]:
print(type(b_all))
print(type(b))

<class 'set'>
<class 'list'>


In [32]:
def jaccard_similarity(x, y):
    x = set(x)
    intersection = x.intersection(y)
    union = x.union(y)
    return len(intersection) / len(union)

print('b_all to b_all', jaccard_similarity(b_all, b_all))
print('b to b', jaccard_similarity(b, b))

print('b_all to c_all', jaccard_similarity(b_all, c_all))
print('b to c', jaccard_similarity(b, c))

print('c_all to d_all', jaccard_similarity(c_all, d_all))
print('c to d', jaccard_similarity(c, d))

b_all to b_all 1.0
b to b 1.0
b_all to c_all 0.0
b to c 0.0
c_all to d_all 0.2544146624303506
c to d 0.1607661056297156


In [33]:
# now instead of a minhash, implement a scaled sketch, which is what
# we recommend using in sourmash.

# scaled here is 1/f of k-mers that we are going to keep
# this is a sampling mechanism where we "stochastically" sample
# by randomizing the order of k-mers with a hash function, and
# then choosing a subset of them deterministically.

def scaledhash(kmers, scaled=1000):
    basket = set()
    
    MAX_HASH=2**64 - 1
    boundary = MAX_HASH / scaled
    
    for kmer in kmers:
        hashval = mmh3.hash64(kmer, seed=42)[0]
        if hashval < 0:
            hashval += 2**64
            
        # accept any k-mer that hashes into the bottom 1/scaled of the
        # hash space. what this should do (for a good hash function)
        # is pick approximately 1 in 'scaled' of the input k-mers.
        if hashval < boundary:
            basket.add(hashval)

    return basket
        
b_sh = scaledhash(b_all, scaled=1000)
c_sh = scaledhash(c_all, scaled=1000)
d_sh = scaledhash(d_all, scaled=1000)


In [34]:
print('b_all to b_all', jaccard_similarity(b_all, b_all))
print('b to b', jaccard_similarity(b, b))
print('b_sh to b_sh', jaccard_similarity(b_sh, b_sh))


print('b_all to c_all', jaccard_similarity(b_all, c_all))
print('b to c', jaccard_similarity(b, c))
print('b_sh to c_sh', jaccard_similarity(b_sh, c_sh))

print('c_all to d_all', jaccard_similarity(c_all, d_all))
print('c to d', jaccard_similarity(c, d))
print('c_sh to dsh', jaccard_similarity(c_sh, d_sh))

b_all to b_all 1.0
b to b 1.0
b_sh to b_sh 1.0
b_all to c_all 0.0
b to c 0.0
b_sh to c_sh 0.0
c_all to d_all 0.2544146624303506
c to d 0.1607661056297156
c_sh to dsh 0.1889400921658986



* 'scaled' implementation, and using scaled hashes for Jaccard similarity and containment calculations
    * with a scaled of 10,000, you get approximately 1 hash per every 10kb of sequence
    * see [graph at 10kb](https://github.com/dib-lab/charcoal/blob/master/stats/stats10k.png), [graph at 5kb](https://github.com/dib-lab/charcoal/blob/master/stats/stats5k.png)
    * (similar statistics apply on a stream of distinct k-mers)


* looking in more depth at the signatures
* manipulating signatures (command line, Python, etc.)


In [36]:
!jq < 2.fa.sig

[1;39m[
  [1;39m{
    [0m[34;1m"class"[0m[1;39m: [0m[0;32m"sourmash_signature"[0m[1;39m,
    [0m[34;1m"email"[0m[1;39m: [0m[0;32m""[0m[1;39m,
    [0m[34;1m"hash_function"[0m[1;39m: [0m[0;32m"0.murmur64"[0m[1;39m,
    [0m[34;1m"filename"[0m[1;39m: [0m[0;32m"data/genomes/2.fa"[0m[1;39m,
    [0m[34;1m"name"[0m[1;39m: [0m[0;32m"CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome"[0m[1;39m,
    [0m[34;1m"license"[0m[1;39m: [0m[0;32m"CC0"[0m[1;39m,
    [0m[34;1m"signatures"[0m[1;39m: [0m[1;39m[
      [1;39m{
        [0m[34;1m"num"[0m[1;39m: [0m[0;39m0[0m[1;39m,
        [0m[34;1m"ksize"[0m[1;39m: [0m[0;39m21[0m[1;39m,
        [0m[34;1m"seed"[0m[1;39m: [0m[0;39m42[0m[1;39m,
        [0m[34;1m"max_hash"[0m[1;39m: [0m[0;39m18446744073709550[0m[1;39m,
        [0m[34;1m"mins"[0m[1;39m: [0m[1;39m[
          [0;39m9253370411757[0m[1;39m,
          [0;39m22055233311447[0m[1

          [0;39m11106426173096134[0m[1;39m,
          [0;39m11111008680574732[0m[1;39m,
          [0;39m11112971544804072[0m[1;39m,
          [0;39m11117142906970704[0m[1;39m,
          [0;39m11122887944491864[0m[1;39m,
          [0;39m11123134489672404[0m[1;39m,
          [0;39m11124313190051970[0m[1;39m,
          [0;39m11131274045814432[0m[1;39m,
          [0;39m11134768830434580[0m[1;39m,
          [0;39m11137871193966872[0m[1;39m,
          [0;39m11139439324849368[0m[1;39m,
          [0;39m11145533100200114[0m[1;39m,
          [0;39m11150515087815276[0m[1;39m,
          [0;39m11156384085429662[0m[1;39m,
          [0;39m11157892369956176[0m[1;39m,
          [0;39m11161009408719874[0m[1;39m,
          [0;39m11166211799466712[0m[1;39m,
          [0;39m11170717557392304[0m[1;39m,
          [0;39m11184536722543476[0m[1;39m,
          [0;39m11187525741338652[0m[1;39m,
          [0;39m11226007334467712[0m[

          [0;39m5703741162189807[0m[1;39m,
          [0;39m5706535934081315[0m[1;39m,
          [0;39m5707128466246152[0m[1;39m,
          [0;39m5717731977260371[0m[1;39m,
          [0;39m5719633091855226[0m[1;39m,
          [0;39m5720510963194346[0m[1;39m,
          [0;39m5729579279104706[0m[1;39m,
          [0;39m5729764028324178[0m[1;39m,
          [0;39m5740630525074950[0m[1;39m,
          [0;39m5741129486732745[0m[1;39m,
          [0;39m5745453911718457[0m[1;39m,
          [0;39m5749789577819561[0m[1;39m,
          [0;39m5752026401606325[0m[1;39m,
          [0;39m5760229237068668[0m[1;39m,
          [0;39m5762776817176351[0m[1;39m,
          [0;39m5767566225687055[0m[1;39m,
          [0;39m5770214575183328[0m[1;39m,
          [0;39m5771595211018276[0m[1;39m,
          [0;39m5781390291816527[0m[1;39m,
          [0;39m5804083696129527[0m[1;39m,
          [0;39m5808000639892486[0m[1;39m,
          [0

          [0;39m3821370728424399[0m[1;39m,
          [0;39m3831091291497756[0m[1;39m,
          [0;39m3833594319775588[0m[1;39m,
          [0;39m3836070293893334[0m[1;39m,
          [0;39m3836463726302801[0m[1;39m,
          [0;39m3843454534654819[0m[1;39m,
          [0;39m3865330015850533[0m[1;39m,
          [0;39m3865731389902624[0m[1;39m,
          [0;39m3868815096902378[0m[1;39m,
          [0;39m3874152187538165[0m[1;39m,
          [0;39m3879504507792185[0m[1;39m,
          [0;39m3881012401310194[0m[1;39m,
          [0;39m3890928823553932[0m[1;39m,
          [0;39m3894340812924690[0m[1;39m,
          [0;39m3910525773502689[0m[1;39m,
          [0;39m3913041691319282[0m[1;39m,
          [0;39m3915621729619021[0m[1;39m,
          [0;39m3919374357408690[0m[1;39m,
          [0;39m3957513182725671[0m[1;39m,
          [0;39m3959863515871547[0m[1;39m,
          [0;39m3964876524010791[0m[1;39m,
          [0

          [0;39m14745918381531036[0m[1;39m,
          [0;39m14747781274261754[0m[1;39m,
          [0;39m14769235813387444[0m[1;39m,
          [0;39m14774506216866144[0m[1;39m,
          [0;39m14775858199845320[0m[1;39m,
          [0;39m14777438862619136[0m[1;39m,
          [0;39m14785706151846816[0m[1;39m,
          [0;39m14797080542518334[0m[1;39m,
          [0;39m14808067053963172[0m[1;39m,
          [0;39m14815595921590586[0m[1;39m,
          [0;39m14816883728169456[0m[1;39m,
          [0;39m14819323611785348[0m[1;39m,
          [0;39m14819840263469102[0m[1;39m,
          [0;39m14827625495688744[0m[1;39m,
          [0;39m14839029876559500[0m[1;39m,
          [0;39m14850591011118274[0m[1;39m,
          [0;39m14861741509747314[0m[1;39m,
          [0;39m14892690229161784[0m[1;39m,
          [0;39m14894257791973640[0m[1;39m,
          [0;39m14902573036352340[0m[1;39m,
          [0;39m14913146370699988[0m[

          [0;39m17825363186790720[0m[1;39m,
          [0;39m17839438010001086[0m[1;39m,
          [0;39m17854602718146652[0m[1;39m,
          [0;39m17859786785776348[0m[1;39m,
          [0;39m17877311442819478[0m[1;39m,
          [0;39m17905061698046516[0m[1;39m,
          [0;39m17915900664445324[0m[1;39m,
          [0;39m17921723325622658[0m[1;39m,
          [0;39m17927897596423856[0m[1;39m,
          [0;39m17945154459135646[0m[1;39m,
          [0;39m17946171820546868[0m[1;39m,
          [0;39m17963522321206264[0m[1;39m,
          [0;39m17968127380401234[0m[1;39m,
          [0;39m17969920306213012[0m[1;39m,
          [0;39m17970588515711704[0m[1;39m,
          [0;39m17971561735516464[0m[1;39m,
          [0;39m17979603471308842[0m[1;39m,
          [0;39m17981246765331158[0m[1;39m,
          [0;39m17991312557003936[0m[1;39m,
          [0;39m17993515278583148[0m[1;39m,
          [0;39m17994374617688536[0m[

## repo at github.com/ctb/2020-jgi-tech-talk

In [40]:
import sourmash
sig2 = sourmash.load_one_signature('2.fa.sig', ksize=31)
sig63 = sourmash.load_one_signature('63.fa.sig', ksize=31)
sig47 = sourmash.load_one_signature('47.fa.sig', ksize=31)

In [42]:
sig63.similarity(sig47)

0.3206949023586102

In [43]:
a = set(sig63.minhash.get_mins())
b=  set(sig47.minhash.get_mins())

jaccard_similarity(a, b)

0.3206949023586102

In [44]:
!sourmash search 47.fa.sig 63.fa.sig

[K
== This is sourmash version 3.2.4.dev5+g6484e78f. ==
[K== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==

[Kselecting default query k=31.
[Kloaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)
[Kloaded 1 signatures.                                                           

1 matches:
similarity   match
----------   -----
 32.1%       NC_011663.1 Shewanella baltica OS223, complete genome


* downsampling in particular
* ...hash matches do indeed correspond to nucleotide alignments!
* downsides of scaled approach:
    * don't work for small genomes
    * arbitrary growth in size (so may be unnecessarily large for large genomes!)
    
tl;dr sourmash is basically many thousands of lines of code around doing this stuff with reasonable efficiency and care.

In [45]:
sig63.similarity(sig47)

0.3206949023586102

In [46]:
sig63.contained_by(sig47)

0.48281786941580757

The scaledhash represents a compositional approach to sketching.

In [47]:
d_sh

{71315379183989,
 189372881188038,
 312508185357838,
 385242214102144,
 395356088720884,
 444196295289556,
 457671098462976,
 623564286058606,
 717098217279451,
 736734785664407,
 792800620274218,
 1075576974119467,
 1127740241225995,
 1278259882696625,
 1339674524726757,
 1432423484009425,
 1466391743844010,
 1505096985478881,
 1535451811798960,
 1596940098556068,
 1642353838656017,
 1658104204669725,
 1669812089421733,
 1719016742757524,
 1800236363852916,
 1911464021342251,
 1977965122461301,
 1996672784729607,
 2154825108832254,
 2296330477067902,
 2359599974602456,
 2556506799873846,
 2566619437831302,
 2742141972024096,
 2798287443905216,
 2823296594306435,
 2947035710205399,
 3009879725684408,
 3056598953438835,
 3101693309309556,
 3103723994300125,
 3173337239261818,
 3315214714717367,
 3320924500756368,
 3321523572005253,
 3549947024705822,
 3675775119596083,
 3863738703644495,
 4421125787216995,
 4473513246905919,
 4499856296475689,
 4501876131664304,
 4607450449118254,
 4613

In [57]:
# note you can downsample scaled hash signatures without going back to 
# the raw data.
def downsample_scaledhash(input_basket, scaled=1000):
    basket = set()
    
    MAX_HASH=2**64 - 1
    boundary = MAX_HASH / scaled
    
    for hashval in input_basket:
        if hashval < 0:
            hashval += 2**64
            
        # accept any k-mer that hashes into the bottom 1/scaled of the
        # hash space. what this should do (for a good hash function)
        # is pick approximately 1 in 'scaled' of the input k-mers.
        if hashval < boundary:
            basket.add(hashval)

    return basket


In [53]:
b_sh2 = downsample_scaledhash(b_sh, 1000)
b_sh2 == b_sh
b_sh3 = downsample_scaledhash(b_sh, 10000)

In [54]:
len(b_sh3)

9

In [55]:
len(b_sh2)

52